miga-base 0.7.7.0 → 0.7.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3682f50e3efe936ce751cd83cc7945edddb8e1c3ea6e654c4d54f8ea79efbfcb
4
- data.tar.gz: a5bc821d8f1b6f55baf495eea28e8783c86c61ffaca2d486e4589b818a60038f
3
+ metadata.gz: 7b4a168130d732c670246cd4a874272e77e5f7d88fdef00e10d81ab8e5f9979a
4
+ data.tar.gz: '069e2dd280b4afecb67478612f1dee35bf2cada3ae57cbc61c6e70d0ef3bd233'
5
5
  SHA512:
6
- metadata.gz: 23e986949f97ae31498b7310eba666f0fc4b5f3e4ab9d38a135b2934db901449dca70ce74830e4353bb60f2196ce2c195b1bfb20400f884494e9766e58ea5214
7
- data.tar.gz: 3857008111b8a65b1fbf09442eb3a657789ebf964769c7805485be57298141c363296b7d2491ef3379726344f07892211d10d4c72fc74a8095bc0eaf00d4e873
6
+ metadata.gz: a37fd7d69339c7a63d5ac38e0c232fed96d479c3f2f2bc67b2ee956bb908d8690a55f21a6fa0185c05f209139e16b6b2ddcd6b0f36fac471f9e0b4fd2c4a5f04
7
+ data.tar.gz: cb156656c79f1a765281f163691650e32f90056dc767827d4b3fe958b4042e6359b810f5a3249c6902bce042f6a457507a027836797c39328889d9cbbbc5c5d0
data/README.md CHANGED
@@ -41,6 +41,7 @@ Developed and maintained by [Luis M. Rodriguez-R][lrr]. MiGA is the result of a
41
41
  collaboration between [Kostas Lab][kostas] at the Georgia Institute of
42
42
  Technology and [RDP][rdp] at Michigan State University.
43
43
 
44
+ See also the [complete list of contributors](manual/part1/contributors.md).
44
45
 
45
46
  # License
46
47
 
@@ -42,7 +42,9 @@ class MiGA::Cli::Action::ClassifyWf < MiGA::Cli::Action
42
42
  '--no-summaries',
43
43
  'Do not generate intermediate step summaries'
44
44
  ) { |v| cli[:summaries] = v }
45
- opts_for_wf(opt, 'Input genome assemblies (nucleotides, FastA)')
45
+ opts_for_wf(
46
+ opt, 'Input genome assemblies (nucleotides, FastA)', qual: false
47
+ )
46
48
  end
47
49
  end
48
50
 
@@ -19,6 +19,10 @@ class MiGA::Cli::Action::DerepWf < MiGA::Cli::Action
19
19
  'Use Average Amino Acid Identity (AAI) as genome similarity metric',
20
20
  'By default: Use Average Nucleotide Identity (ANI)'
21
21
  ) { cli[:metric] = :aai }
22
+ opt.on(
23
+ '--ani',
24
+ 'Use Average Nucleotide Identity (ANI) as similarity metric (default)'
25
+ ) { cli[:metric] = :ani }
22
26
  opt.on(
23
27
  '--threshold FLOAT', Float,
24
28
  "Metric threshold (%) to dereplicate. By default: #{cli[:threshold]}"
@@ -17,18 +17,21 @@ class MiGA::Cli::Action::Edit < MiGA::Cli::Action
17
17
  'Activate dataset; requires -D'
18
18
  ) { |v| cli[:activate] = v }
19
19
  opt.on(
20
- '--inactivate',
21
- 'Inactivate dataset; requires -D'
22
- ) { |v| cli[:activate] = !v }
20
+ '--inactivate [reason]',
21
+ 'Inactivate dataset; requires -D',
22
+ 'The argument is optional: reason to inactivate dataset'
23
+ ) { |v| cli[:activate] = false ; cli[:reason] = v }
23
24
  end
24
25
  end
25
26
 
26
27
  def perform
27
28
  obj = cli.load_project_or_dataset
28
29
  unless cli[:activate].nil?
29
- cli.ensure_par({ dataset: '-D' },
30
- '%<name>s is mandatory with --[in-]activate: please provide %<flag>s')
31
- cli[:activate] ? obj.activate! : obj.inactivate!
30
+ cli.ensure_par(
31
+ { dataset: '-D' },
32
+ '%<name>s is mandatory with --[in-]activate: please provide %<flag>s'
33
+ )
34
+ cli[:activate] ? obj.activate! : obj.inactivate!(cli[:reason])
32
35
  end
33
36
  cli.add_metadata(obj)
34
37
  obj.save
@@ -15,7 +15,10 @@ class MiGA::Cli::Action::QualityWf < MiGA::Cli::Action
15
15
  '-m', '--mytaxa-scan',
16
16
  'Perform MyTaxa scan analysis'
17
17
  ) { |v| cli[:mytaxa] = v }
18
- opts_for_wf(opt, 'Input genome assemblies (nucleotides, FastA)')
18
+ opts_for_wf(
19
+ opt, 'Input genome assemblies (nucleotides, FastA)',
20
+ qual: false
21
+ )
19
22
  end
20
23
  end
21
24
 
@@ -14,7 +14,7 @@ class MiGA::Cli::Action::Stats < MiGA::Cli::Action
14
14
  ) { |v| cli[:key] = v }
15
15
  opt.on(
16
16
  '--compute-and-save',
17
- 'Compute and saves the statistics'
17
+ 'Compute and save the statistics'
18
18
  ) { |v| cli[:compute] = v }
19
19
  opt.on(
20
20
  '--try-load',
@@ -38,7 +38,7 @@ class MiGA::Cli::Action::Stats < MiGA::Cli::Action
38
38
  end
39
39
  if cli[:key].nil?
40
40
  r[:stats].each do |k, v|
41
- k_n = k == :g_c_content ? 'G+C content' : k.to_s.unmiga_name.capitalize
41
+ k_n = k.to_s.unmiga_name.sub(/^./, &:upcase)
42
42
  cli.puts "#{k_n}: #{v.is_a?(Array) ? v.join(' ') : v}"
43
43
  end
44
44
  else
@@ -26,6 +26,10 @@ class MiGA::Cli::Action::Summary < MiGA::Cli::Action
26
26
  '--with-units',
27
27
  'Include units in each cell'
28
28
  ) { |v| cli[:units] = v }
29
+ opt.on(
30
+ '--compute-and-save',
31
+ 'Compute and save the statistics if not yet available'
32
+ ) { |v| cli[:compute] = v }
29
33
  end
30
34
  end
31
35
 
@@ -34,7 +38,8 @@ class MiGA::Cli::Action::Summary < MiGA::Cli::Action
34
38
  ds = cli.load_and_filter_datasets
35
39
  cli.say 'Loading results'
36
40
  stats = ds.map do |d|
37
- r = d.add_result(cli[:result].to_sym, false)
41
+ r = d.result(cli[:result])
42
+ r.compute_stats if cli[:compute] && !r.nil? && r[:stats].empty?
38
43
  s = r.nil? ? {} : r[:stats]
39
44
  s.tap { |i| i[:dataset] = d.name }
40
45
  end
@@ -15,7 +15,7 @@ module MiGA::Cli::Action::Wf
15
15
 
16
16
  def opts_for_wf(opt, files_desc, params = {})
17
17
  {
18
- multi: false, cleanup: true, project_type: false, ncbi: true
18
+ multi: false, cleanup: true, project_type: false, ncbi: true, qual: true
19
19
  }.each { |k, v| params[k] = v if params[k].nil? }
20
20
  opt.on(
21
21
  '-o', '--out_dir PATH',
@@ -40,6 +40,13 @@ module MiGA::Cli::Action::Wf
40
40
  'Only download complete genomes, not drafts'
41
41
  ) { |v| cli[:ncbi_draft] = v }
42
42
  end
43
+ if params[:qual]
44
+ opt.on(
45
+ '--min-qual FLOAT', Float,
46
+ 'Minimum genome quality to include in analysis',
47
+ 'By default: 50.0'
48
+ ) { |v| cli[:min_qual] = v }
49
+ end
43
50
  if params[:cleanup]
44
51
  opt.on(
45
52
  '-c', '--clean',
@@ -89,6 +96,10 @@ module MiGA::Cli::Action::Wf
89
96
  end
90
97
 
91
98
  def opts_for_wf_distances(opt)
99
+ opt.on('--sensitive', 'Alias to: --aai-p blast+ --ani-p blast+') do
100
+ cli[:aai_p] = 'blast+'
101
+ cli[:ani_p] = 'blast+'
102
+ end
92
103
  opt.on('--fast', 'Alias to: --aai-p diamond --ani-p fastani') do
93
104
  cli[:aai_p] = 'diamond'
94
105
  cli[:ani_p] = 'fastani'
@@ -121,7 +132,7 @@ module MiGA::Cli::Action::Wf
121
132
  ]) unless MiGA::Project.exist? cli[:outdir]
122
133
  # Define project metadata
123
134
  p = cli.load_project(:outdir, '-o')
124
- [:haai_p, :aai_p, :ani_p, :ess_coll].each { |i| p_metadata[i] = cli[i] }
135
+ %i[haai_p aai_p ani_p ess_coll min_qual].each { |i| p_metadata[i] = cli[i] }
125
136
  p_metadata[:type] = cli[:project_type]
126
137
  transfer_metadata(p, p_metadata)
127
138
  # Download datasets
@@ -155,7 +166,7 @@ module MiGA::Cli::Action::Wf
155
166
  '-P', cli[:outdir],
156
167
  '-r', r,
157
168
  '-o', File.expand_path("#{r}.tsv", cli[:outdir]),
158
- '--tab'
169
+ '--tab', '--ref', '--active'
159
170
  ])
160
171
  end
161
172
  end
@@ -68,15 +68,20 @@ module MiGA::Common::Format
68
68
  # a FastA or FastQ file (supports gzipped files). The +format+ must be a
69
69
  # Symbol, one of +:fasta+ or +:fastq+. Additional estimations can be
70
70
  # controlled via the +opts+ Hash. Supported options include:
71
- # - +:n50+: If true, it also returns the N50 and the median (in bp)
72
- # - +:gc+: If true, it also returns the G+C content (in %)
73
- # - +:x+: If true, it also returns the undetermined bases content (in %)
71
+ # - +:n50+: Include the N50 and the median (in bp)
72
+ # - +:gc+: Include the G+C content (in %)
73
+ # - +:x+: Include the undetermined bases content (in %)
74
+ # - +:skew+: Include G-C and A-T sequence skew (in %; forces gc: true).
75
+ # See definition used here in DOI:10.1177/117693430700300006
74
76
  def seqs_length(file, format, opts = {})
77
+ opts[:gc] = true if opts[:skew]
75
78
  fh = file =~ /\.gz/ ? Zlib::GzipReader.open(file) : File.open(file, 'r')
76
79
  l = []
77
80
  gc = 0
78
81
  xn = 0
79
- i = 0 # <- Zlib::GzipReader doesn't set `$.`
82
+ t = 0
83
+ c = 0
84
+ i = 0 # <- Zlib::GzipReader doesn't set `$.`
80
85
  fh.each_line do |ln|
81
86
  i += 1
82
87
  if (format == :fasta and ln =~ /^>/) or
@@ -86,16 +91,27 @@ module MiGA::Common::Format
86
91
  l[l.size - 1] += ln.chomp.size
87
92
  gc += ln.scan(/[GCgc]/).count if opts[:gc]
88
93
  xn += ln.scan(/[XNxn]/).count if opts[:x]
94
+ if opts[:skew]
95
+ t += ln.scan(/[Tt]/).count
96
+ c += ln.scan(/[Cc]/).count
97
+ end
89
98
  end
90
99
  end
91
100
  fh.close
92
101
 
93
- o = { n: l.size, tot: l.inject(:+), max: l.max }
102
+ o = { n: l.size, tot: l.inject(0, :+), max: l.max }
103
+ return o if o[:tot].zero?
94
104
  o[:avg] = o[:tot].to_f / l.size
95
105
  o[:var] = l.map { |a| a**2 }.inject(:+).to_f / l.size - o[:avg]**2
96
106
  o[:sd] = Math.sqrt o[:var]
97
107
  o[:gc] = 100.0 * gc / o[:tot] if opts[:gc]
98
108
  o[:x] = 100.0 * xn / o[:tot] if opts[:x]
109
+ if opts[:skew]
110
+ at = o[:tot] - gc
111
+ o[:at_skew] = 100.0 * (2 * t - at) / at
112
+ o[:gc_skew] = 100.0 * (2 * c - gc) / gc
113
+ end
114
+
99
115
  if opts[:n50]
100
116
  l.sort!
101
117
  thr = o[:tot] / 2
@@ -106,7 +122,8 @@ module MiGA::Common::Format
106
122
  break if pos >= thr
107
123
  end
108
124
  o[:med] = o[:n].even? ?
109
- 0.5 * l[o[:n] / 2 - 1, 2].inject(:+) : l[(o[:n] - 1) / 2]
125
+ 0.5 * l[o[:n] / 2 - 1, 2].inject(:+) :
126
+ l[(o[:n] - 1) / 2]
110
127
  end
111
128
  o
112
129
  end
@@ -130,9 +147,14 @@ class String
130
147
  end
131
148
 
132
149
  ##
133
- # Replace underscores by spaces or dots (depending on context).
150
+ # Replace underscores by spaces or other symbols depending on context
134
151
  def unmiga_name
135
- gsub(/_(str|sp|subsp|pv)__/, '_\\1._').tr('_', ' ')
152
+ gsub(/_(str|sp|subsp|pv)__/, '_\\1._')
153
+ .gsub(/g_c_(content)/, 'G+C \\1')
154
+ .gsub(/g_c_(skew)/, 'G-C \\1')
155
+ .gsub(/a_t_(skew)/, 'A-T \\1')
156
+ .gsub(/x_content/, &:capitalize)
157
+ .tr('_', ' ')
136
158
  end
137
159
 
138
160
  ##
@@ -72,6 +72,7 @@ class MiGA::Daemon < MiGA::MiGA
72
72
  say '-----------------------------------'
73
73
  say 'MiGA:%s launched' % project.name
74
74
  say '-----------------------------------'
75
+ recalculate_status!
75
76
  load_status
76
77
  say 'Configuration options:'
77
78
  say @runopts.to_s
@@ -99,6 +100,7 @@ class MiGA::Daemon < MiGA::MiGA
99
100
  end
100
101
 
101
102
  def recalculate_status!
103
+ say 'Recalculating status for all datasets'
102
104
  project.each_dataset(&:recalculate_status)
103
105
  end
104
106
 
@@ -158,8 +160,8 @@ class MiGA::Daemon < MiGA::MiGA
158
160
  end
159
161
 
160
162
  ##
161
- # Traverse datasets, and returns boolean indicating if at any datasets
162
- # are incomplete
163
+ # Traverse datasets, and returns boolean indicating if at any reference
164
+ # datasets are incomplete
163
165
  def check_datasets
164
166
  l_say(2, 'Checking datasets')
165
167
  o = false
@@ -167,7 +169,7 @@ class MiGA::Daemon < MiGA::MiGA
167
169
  next unless ds.status == :incomplete
168
170
  next if ds.next_preprocessing(false).nil?
169
171
 
170
- o = true
172
+ o = true if ds.ref?
171
173
  queue_job(:d, ds)
172
174
  end
173
175
  o
@@ -183,7 +185,7 @@ class MiGA::Daemon < MiGA::MiGA
183
185
  return if project.dataset_names.empty?
184
186
 
185
187
  # Double-check if all datasets are ready
186
- return unless project.done_preprocessing?(false)
188
+ return unless project.done_preprocessing?
187
189
 
188
190
  # Queue project-level job
189
191
  to_run = project.next_task(nil, false)
@@ -97,7 +97,10 @@ class MiGA::Dataset < MiGA::MiGA
97
97
 
98
98
  ##
99
99
  # Inactivate a dataset. This halts automated processing by the daemon
100
- def inactivate!
100
+ #
101
+ # If given, the +reason+ string is saved as a metadata +:warn+ entry
102
+ def inactivate!(reason = nil)
103
+ metadata[:warn] = "Inactive: #{reason}" unless reason.nil?
101
104
  metadata[:inactive] = true
102
105
  metadata.save
103
106
  pull_hook :on_inactivate
@@ -107,6 +110,7 @@ class MiGA::Dataset < MiGA::MiGA
107
110
  # Activate a dataset. This removes the +:inactive+ flag
108
111
  def activate!
109
112
  metadata[:inactive] = nil
113
+ metadata[:warn] = nil if metadata[:warn] && metadata[:warn] =~ /^Inactive: /
110
114
  metadata.save
111
115
  pull_hook :on_activate
112
116
  end
@@ -35,8 +35,8 @@ module MiGA::Dataset::Base
35
35
  mytaxa: '07.annotation/02.taxonomy/01.mytaxa',
36
36
  mytaxa_scan: '07.annotation/03.qa/02.mytaxa_scan',
37
37
  # Distances (for single-species datasets)
38
- distances: '09.distances',
39
38
  taxonomy: '09.distances/05.taxonomy',
39
+ distances: '09.distances',
40
40
  # General statistics
41
41
  stats: '90.stats'
42
42
  }
@@ -52,15 +52,15 @@ module MiGA::Dataset::Hooks
52
52
  end
53
53
 
54
54
  ##
55
- # Run +cmd+ in the command-line with {{variables}}: dataset, project, miga,
56
- # object (as defined for the event, if any)
55
+ # Run +cmd+ in the command-line with {{variables}}:
56
+ # dataset, project, project_name, miga, object (if defined for the event)
57
57
  # - +hook_args+: +[cmd]+
58
58
  # - +event_args+: +[object (optional)]+
59
59
  def hook_run_cmd(hook_args, event_args)
60
60
  Process.wait(
61
61
  spawn hook_args.first.miga_variables(
62
- dataset: name, project: project.path, miga: MiGA::MiGA.root_path,
63
- object: event_args.first
62
+ dataset: name, project: project.path, project_name: project.name,
63
+ miga: MiGA::MiGA.root_path, object: event_args.first
64
64
  )
65
65
  )
66
66
  end
@@ -26,15 +26,24 @@ module MiGA::Dataset::Result
26
26
  # The values are symbols:
27
27
  # - empty: the dataset has no data
28
28
  # - inactive: the dataset is inactive
29
+ # - upstream: the task is upstream from dataset's input
29
30
  # - force: forced to ignore by metadata
30
31
  # - project: incompatible project
31
32
  # - noref: incompatible dataset, only for reference
32
33
  # - multi: incompatible dataset, only for multi
33
34
  # - nonmulti: incompatible dataset, only for nonmulti
35
+ # - complete: the task is already complete
34
36
  # - execute: do not ignore, execute the task
35
37
  def why_ignore(task)
36
- if !active?
38
+ if !get_result(task).nil?
39
+ :complete
40
+ elsif !active?
37
41
  :inactive
42
+ elsif first_preprocessing.nil?
43
+ :empty
44
+ elsif @@PREPROCESSING_TASKS.index(task) <
45
+ @@PREPROCESSING_TASKS.index(first_preprocessing)
46
+ :upstream
38
47
  elsif !metadata["run_#{task}"].nil?
39
48
  metadata["run_#{task}"] ? :execute : :force
40
49
  elsif task == :taxonomy && project.metadata[:ref_project].nil?
@@ -56,7 +65,7 @@ module MiGA::Dataset::Result
56
65
  # initial input. Passes +save+ to #add_result.
57
66
  def first_preprocessing(save = false)
58
67
  @first_processing ||= @@PREPROCESSING_TASKS.find do |t|
59
- !ignore_task?(t) && !add_result(t, save).nil?
68
+ !add_result(t, save).nil?
60
69
  end
61
70
  end
62
71
 
@@ -70,7 +79,7 @@ module MiGA::Dataset::Result
70
79
  false
71
80
  elsif add_result(t, save).nil?
72
81
  if (metadata["_try_#{t}"] || 0) > (project.metadata[:max_try] || 10)
73
- inactivate!
82
+ inactivate! "Too many errors in step #{t}"
74
83
  false
75
84
  else
76
85
  true
@@ -121,17 +130,12 @@ module MiGA::Dataset::Result
121
130
  # - complete: a task with registered results
122
131
  # - pending: a task queued to be performed
123
132
  def result_status(task)
124
- if first_preprocessing.nil?
125
- :ignore_empty
126
- elsif !get_result(task).nil?
127
- :complete
128
- elsif @@PREPROCESSING_TASKS.index(task) <
129
- @@PREPROCESSING_TASKS.index(first_preprocessing)
130
- :-
131
- elsif ignore_task?(task)
132
- :"ignore_#{why_ignore task}"
133
- else
134
- :pending
133
+ reason = why_ignore(task)
134
+ case reason
135
+ when :upstream; :-
136
+ when :execute; :pending
137
+ when :complete; :complete
138
+ else; :"ignore_#{reason}"
135
139
  end
136
140
  end
137
141
 
@@ -111,7 +111,7 @@ class MiGA::Lair < MiGA::MiGA
111
111
 
112
112
  yield(project)
113
113
  elsif Dir.exist? f
114
- each_project(f) { |project| yield(project) }
114
+ each_project(f) { |p| yield(p) }
115
115
  end
116
116
  end
117
117
  end
@@ -134,12 +134,10 @@ module MiGA::Project::Dataset
134
134
  ##
135
135
  # Are all the datasets in the project preprocessed? Save intermediate results
136
136
  # if +save+ (until the first incomplete dataset is reached).
137
- def done_preprocessing?(save = true)
138
- dataset_names.each do |dn|
139
- ds = dataset(dn)
140
- return false if ds.is_ref? and not ds.done_preprocessing?(save)
137
+ def done_preprocessing?(save = false)
138
+ !each_dataset.any? do |d|
139
+ d.ref? && d.active? && !d.done_preprocessing?(save)
141
140
  end
142
- true
143
141
  end
144
142
 
145
143
  ##
@@ -26,14 +26,15 @@ module MiGA::Project::Hooks
26
26
  end
27
27
 
28
28
  ##
29
- # Run +cmd+ in the command-line with {{variables}}: project, miga,
30
- # object (as defined by the event, if any)
29
+ # Run +cmd+ in the command-line with {{variables}}:
30
+ # project, project_name, miga, object (if defined by the event)
31
31
  # - +hook_args+: +[cmd]+
32
32
  # - +event_args+: +[object (optional)]+
33
33
  def hook_run_cmd(hook_args, event_args)
34
34
  Process.wait(
35
35
  spawn hook_args.first.miga_variables(
36
- project: path, miga: MiGA::MiGA.root_path, object: event_args.first
36
+ project: path, project_name: name,
37
+ miga: MiGA::MiGA.root_path, object: event_args.first
37
38
  )
38
39
  )
39
40
  end
@@ -94,12 +94,13 @@ class MiGA::RemoteDataset
94
94
  @timeout_try = 0
95
95
  begin
96
96
  DEBUG 'GET: ' + url
97
- open(url, read_timeout: 600) { |f| doc = f.read }
97
+ URI.parse(url).open(read_timeout: 600) { |f| doc = f.read }
98
98
  rescue => e
99
99
  @timeout_try += 1
100
100
  raise e if @timeout_try >= 3
101
101
 
102
102
  sleep 5 # <- For: 429 Too Many Requests
103
+ DEBUG "RETRYING after: #{e}"
103
104
  retry
104
105
  end
105
106
  doc
@@ -45,10 +45,6 @@ class MiGA::Result < MiGA::MiGA
45
45
  # Hash with the result metadata
46
46
  attr_reader :data
47
47
 
48
- ##
49
- # Array of MiGA::Result objects nested within the result (if any)
50
- attr_reader :results
51
-
52
48
  ##
53
49
  # Load or create the MiGA::Result described by the JSON file +path+
54
50
  def initialize(path)
@@ -78,9 +74,9 @@ class MiGA::Result < MiGA::MiGA
78
74
  when :json
79
75
  @path
80
76
  when :start
81
- @path.sub(/\.json$/, ".start")
77
+ @path.sub(/\.json$/, '.start')
82
78
  when :done
83
- @path.sub(/\.json$/, ".done")
79
+ @path.sub(/\.json$/, '.done')
84
80
  end
85
81
  end
86
82
 
@@ -134,7 +130,7 @@ class MiGA::Result < MiGA::MiGA
134
130
  ##
135
131
  # Initialize and #save empty result
136
132
  def create
137
- @data = { created: Time.now.to_s, results: [], stats: {}, files: {} }
133
+ @data = { created: Time.now.to_s, stats: {}, files: {} }
138
134
  save
139
135
  end
140
136
 
@@ -156,19 +152,20 @@ class MiGA::Result < MiGA::MiGA
156
152
  def load
157
153
  @data = MiGA::Json.parse(path)
158
154
  @data[:files] ||= {}
159
- @results = (self[:results] || []).map { |rs| MiGA::Result.new rs }
160
155
  end
161
156
 
162
157
  ##
163
158
  # Remove result, including all associated files
164
159
  def remove!
165
- each_file do |file|
166
- f = File.expand_path(file, dir)
167
- FileUtils.rm_rf(f)
168
- end
169
- %w(.start .done).each do |ext|
170
- f = path.sub(/\.json$/, ext)
171
- File.unlink f if File.exist? f
160
+ each_file { |file| FileUtils.rm_rf(File.join(dir, file)) }
161
+ unlink
162
+ end
163
+
164
+ # Unlink result by removing the .done and .start timestamps and the
165
+ # .json descriptor, but don't remove any other associated files
166
+ def unlink
167
+ %i(start done).each do |i|
168
+ f = path(i) and File.exists?(f) and File.unlink(f)
172
169
  end
173
170
  File.unlink path
174
171
  end
@@ -182,28 +179,19 @@ class MiGA::Result < MiGA::MiGA
182
179
  # Note that multiple files may have the same symbol (file_sym), since
183
180
  # arrays of files are supported.
184
181
  def each_file(&blk)
182
+ return to_enum(:each_file) unless block_given?
183
+
185
184
  @data[:files] ||= {}
186
185
  self[:files].each do |k, files|
187
186
  files = [files] unless files.kind_of? Array
188
187
  files.each do |file|
189
188
  case blk.arity
190
- when 1
191
- blk.call(file)
192
- when 2
193
- blk.call(k, file)
194
- when 3
195
- blk.call(k, file, File.expand_path(file, dir))
196
- else
197
- raise "Wrong number of arguments: #{blk.arity} for 1..3"
189
+ when 1; blk.call(file)
190
+ when 2; blk.call(k, file)
191
+ when 3; blk.call(k, file, File.expand_path(file, dir))
192
+ else; raise "Wrong number of arguments: #{blk.arity} for 1..3"
198
193
  end
199
194
  end
200
195
  end
201
196
  end
202
-
203
- ##
204
- # Add the MiGA::Result +result+ as part of the current result
205
- def add_result(result)
206
- @data[:results] << result.path
207
- save
208
- end
209
197
  end
@@ -8,6 +8,7 @@ module MiGA::Result::Stats
8
8
  # (Re-)calculate and save the statistics for the result
9
9
  def compute_stats
10
10
  method = :"compute_stats_#{key}"
11
+ MiGA::MiGA.DEBUG "Result(#{key}).compute_stats"
11
12
  stats = self.respond_to?(method, true) ? send(method) : nil
12
13
  unless stats.nil?
13
14
  self[:stats] = stats
@@ -20,28 +21,35 @@ module MiGA::Result::Stats
20
21
 
21
22
  def compute_stats_raw_reads
22
23
  stats = {}
24
+ seq_opts = { gc: true, x: true, skew: true }
23
25
  if self[:files][:pair1].nil?
24
- s = MiGA::MiGA.seqs_length(file_path(:single), :fastq, gc: true, x: true)
26
+ s = MiGA::MiGA.seqs_length(file_path(:single), :fastq, seq_opts)
25
27
  stats = {
26
28
  reads: s[:n],
27
29
  length_average: [s[:avg], 'bp'],
28
30
  length_standard_deviation: [s[:sd], 'bp'],
29
31
  g_c_content: [s[:gc], '%'],
30
- x_content: [s[:x], '%']
32
+ x_content: [s[:x], '%'],
33
+ g_c_skew: [s[:gc_skew], '%'],
34
+ a_t_skew: [s[:at_skew], '%']
31
35
  }
32
36
  else
33
- s1 = MiGA::MiGA.seqs_length(file_path(:pair1), :fastq, gc: true, x: true)
34
- s2 = MiGA::MiGA.seqs_length(file_path(:pair2), :fastq, gc: true, x: true)
37
+ s1 = MiGA::MiGA.seqs_length(file_path(:pair1), :fastq, seq_opts)
38
+ s2 = MiGA::MiGA.seqs_length(file_path(:pair2), :fastq, seq_opts)
35
39
  stats = {
36
40
  read_pairs: s1[:n],
37
41
  forward_length_average: [s1[:avg], 'bp'],
38
42
  forward_length_standard_deviation: [s1[:sd], 'bp'],
39
43
  forward_g_c_content: [s1[:gc], '%'],
40
44
  forward_x_content: [s1[:x], '%'],
45
+ forward_g_c_skew: [s1[:gc_skew], '%'],
46
+ forward_a_t_skew: [s1[:at_skew], '%'],
41
47
  reverse_length_average: [s2[:avg], 'bp'],
42
48
  reverse_length_standard_deviation: [s2[:sd], 'bp'],
43
49
  reverse_g_c_content: [s2[:gc], '%'],
44
- reverse_x_content: [s2[:x], '%']
50
+ reverse_x_content: [s2[:x], '%'],
51
+ reverse_g_c_skew: [s2[:gc_skew], '%'],
52
+ reverse_a_t_skew: [s2[:at_skew], '%']
45
53
  }
46
54
  end
47
55
  stats
@@ -49,19 +57,22 @@ module MiGA::Result::Stats
49
57
 
50
58
  def compute_stats_trimmed_fasta
51
59
  f = self[:files][:coupled].nil? ? file_path(:single) : file_path(:coupled)
52
- s = MiGA::MiGA.seqs_length(f, :fasta, gc: true, x: true)
60
+ s = MiGA::MiGA.seqs_length(f, :fasta, gc: true, x: true, skew: true)
53
61
  {
54
62
  reads: s[:n],
55
63
  length_average: [s[:avg], 'bp'],
56
64
  length_standard_deviation: [s[:sd], 'bp'],
57
65
  g_c_content: [s[:gc], '%'],
58
- x_content: [s[:x], '%']
66
+ x_content: [s[:x], '%'],
67
+ g_c_skew: [s[:gc_skew], '%'],
68
+ a_t_skew: [s[:at_skew], '%']
59
69
  }
60
70
  end
61
71
 
62
72
  def compute_stats_assembly
63
73
  s = MiGA::MiGA.seqs_length(
64
- file_path(:largecontigs), :fasta, n50: true, gc: true, x: true
74
+ file_path(:largecontigs), :fasta,
75
+ n50: true, gc: true, x: true, skew: true
65
76
  )
66
77
  {
67
78
  contigs: s[:n],
@@ -69,7 +80,9 @@ module MiGA::Result::Stats
69
80
  total_length: [s[:tot], 'bp'],
70
81
  longest_sequence: [s[:max], 'bp'],
71
82
  g_c_content: [s[:gc], '%'],
72
- x_content: [s[:x], '%']
83
+ x_content: [s[:x], '%'],
84
+ g_c_skew: [s[:gc_skew], '%'],
85
+ a_t_skew: [s[:at_skew], '%']
73
86
  }
74
87
  end
75
88
 
@@ -109,20 +122,8 @@ module MiGA::Result::Stats
109
122
  end
110
123
  end
111
124
  else
112
- # Fix estimate by domain
113
- if !(tax = source.metadata[:tax]).nil? &&
114
- %w[Archaea Bacteria].include?(tax[:d]) &&
115
- file_path(:raw_report).nil?
116
- scr = "#{MiGA::MiGA.root_path}/utils/domain-ess-genes.rb"
117
- rep = file_path(:report)
118
- rc_p = File.expand_path('.miga_rc', ENV['HOME'])
119
- rc = File.exist?(rc_p) ? ". '#{rc_p}' && " : ''
120
- $stderr.print `#{rc} ruby '#{scr}' \
121
- '#{rep}' '#{rep}.domain' '#{tax[:d][0]}'`
122
- add_file(:raw_report, "#{source.name}.ess/log")
123
- add_file(:report, "#{source.name}.ess/log.domain")
124
- end
125
- # Extract/compute quality values
125
+ # Estimate quality metrics
126
+ fix_essential_genes_by_domain
126
127
  stats = { completeness: [0.0, '%'], contamination: [0.0, '%'] }
127
128
  File.open(file_path(:report), 'r') do |fh|
128
129
  fh.each_line do |ln|
@@ -131,6 +132,8 @@ module MiGA::Result::Stats
131
132
  end
132
133
  end
133
134
  end
135
+
136
+ # Determine qualitative range
134
137
  stats[:quality] = stats[:completeness][0] - stats[:contamination][0] * 5
135
138
  source.metadata[:quality] =
136
139
  case stats[:quality]
@@ -140,6 +143,12 @@ module MiGA::Result::Stats
140
143
  else; :low
141
144
  end
142
145
  source.save
146
+
147
+ # Inactivate low-quality datasets
148
+ min_qual = (project.metadata[:min_qual] || 50)
149
+ if min_qual != 'no' && stats[:quality] < min_qual
150
+ source.inactivate! 'Low quality genome'
151
+ end
143
152
  end
144
153
  stats
145
154
  end
@@ -175,4 +184,21 @@ module MiGA::Result::Stats
175
184
  end
176
185
  stats
177
186
  end
187
+
188
+ # Fix estimates based on essential genes based on taxonomy
189
+ def fix_essential_genes_by_domain
190
+ return if (tax = source.metadata[:tax]).nil? ||
191
+ !%w[Archaea Bacteria].include?(tax[:d]) ||
192
+ file_path(:raw_report)
193
+
194
+ MiGA::MiGA.DEBUG "Fixing essential genes by domain"
195
+ scr = "#{MiGA::MiGA.root_path}/utils/domain-ess-genes.rb"
196
+ rep = file_path(:report)
197
+ rc_p = File.expand_path('.miga_rc', ENV['HOME'])
198
+ rc = File.exist?(rc_p) ? ". '#{rc_p}' && " : ''
199
+ $stderr.print `#{rc} ruby '#{scr}' \
200
+ '#{rep}' '#{rep}.domain' '#{tax[:d][0]}'`
201
+ add_file(:raw_report, "#{source.name}.ess/log")
202
+ add_file(:report, "#{source.name}.ess/log.domain")
203
+ end
178
204
  end
@@ -8,7 +8,7 @@ module MiGA
8
8
  # - Float representing the major.minor version.
9
9
  # - Integer representing gem releases of the current version.
10
10
  # - Integer representing minor changes that require new version number.
11
- VERSION = [0.7, 7, 0]
11
+ VERSION = [0.7, 11, 0]
12
12
 
13
13
  ##
14
14
  # Nickname for the current major.minor version.
@@ -16,7 +16,7 @@ module MiGA
16
16
 
17
17
  ##
18
18
  # Date of the current gem release.
19
- VERSION_DATE = Date.new(2020, 6, 4)
19
+ VERSION_DATE = Date.new(2020, 7, 1)
20
20
 
21
21
  ##
22
22
  # Reference of MiGA.
@@ -9,7 +9,12 @@ cd "$PROJECT/data/09.distances"
9
9
  # Initialize
10
10
  miga date > "$DATASET.start"
11
11
 
12
- # Run
12
+ # Check quality first
13
+ miga stats -P "$PROJECT" -D "$DATASET" -r essential_genes --compute-and-save
14
+ inactive=$(miga ls -P "$PROJECT" -D "$DATASET" -m inactive | cut -f 2)
15
+ [[ "$inactive" == "true" ]] && exit
16
+
17
+ # Run distances
13
18
  ruby -I "$MIGA/lib" "$MIGA/utils/distances.rb" "$PROJECT" "$DATASET"
14
19
 
15
20
  # Finalize
@@ -93,7 +93,7 @@ class DaemonTest < Test::Unit::TestCase
93
93
  0 => /-{20}\n/,
94
94
  1 => /MiGA:#{p.name} launched/,
95
95
  2 => /-{20}\n/,
96
- 5 => /Probing running jobs\n/
96
+ 6 => /Probing running jobs\n/
97
97
  }.each { |k, v| assert_match(v, l[k], "unexpected line: #{k}") }
98
98
  ensure
99
99
  begin
@@ -185,11 +185,13 @@ class DatasetTest < Test::Unit::TestCase
185
185
  d = dataset
186
186
  assert_equal(:incomplete, d.status)
187
187
  assert_predicate(d, :active?)
188
- d.inactivate!
188
+ d.inactivate! 'Too annoying'
189
189
  assert_equal(:inactive, d.status)
190
+ assert_equal('Inactive: Too annoying', d.metadata[:warn])
190
191
  assert_not_predicate(d, :active?)
191
192
  d.activate!
192
193
  assert_equal(:incomplete, d.status)
194
+ assert_nil(d.metadata[:warn])
193
195
  assert_predicate(d, :active?)
194
196
  end
195
197
 
@@ -108,7 +108,7 @@ class ProjectTest < Test::Unit::TestCase
108
108
  d1 = p1.add_dataset('BAH')
109
109
  assert_not_predicate(p1, :done_preprocessing?)
110
110
  FileUtils.touch(File.join(p1.path, 'data', '90.stats', "#{d1.name}.done"))
111
- assert_predicate(p1, :done_preprocessing?)
111
+ assert { p1.done_preprocessing? true }
112
112
  assert_nil(p1.next_inclade)
113
113
  p1.metadata[:type] = :clade
114
114
  assert_equal(:subclades, p1.next_inclade)
@@ -101,7 +101,7 @@ class RemoteDatasetTest < Test::Unit::TestCase
101
101
 
102
102
  def test_ref_type_status
103
103
  declare_remote_access
104
- rd = MiGA::RemoteDataset.new('GCA_002849345', :assembly, :ncbi)
104
+ rd = MiGA::RemoteDataset.new('GCA_003144295.1', :assembly, :ncbi)
105
105
  assert { !rd.get_metadata[:is_type] }
106
106
  assert { rd.get_metadata[:is_ref_type] }
107
107
  end
@@ -29,6 +29,16 @@ module MiGA::DistanceRunner::Pipeline
29
29
  classify(clades, classif, metric, result_fh, val_cls)
30
30
  end
31
31
 
32
+ # Run distances against datasets listed in metadata's +:dist_req+
33
+ def distances_by_request(metric)
34
+ return unless dataset.metadata[:dist_req]
35
+
36
+ $stderr.puts 'Running distances by request'
37
+ dataset.metadata[:dist_req].each do |target|
38
+ ds = ref_project.dataset(target) and send(metric, ds)
39
+ end
40
+ end
41
+
32
42
  # Builds a tree with all visited medoids from any classification level
33
43
  def build_medoids_tree(metric)
34
44
  $stderr.puts "Building medoids tree (metric = #{metric})"
@@ -99,7 +109,7 @@ module MiGA::DistanceRunner::Pipeline
99
109
 
100
110
  # Transfer the taxonomy to the current dataset
101
111
  def transfer_taxonomy(tax)
102
- $stderr.puts "Transferring taxonomy"
112
+ $stderr.puts 'Transferring taxonomy'
103
113
  return if tax.nil?
104
114
 
105
115
  pval = (project.metadata[:tax_pvalue] || 0.05).to_f
@@ -67,7 +67,7 @@ class MiGA::DistanceRunner
67
67
 
68
68
  # Launch analysis for reference datasets
69
69
  def go_ref!
70
- $stderr.puts "Launching analysis for reference dataset"
70
+ $stderr.puts 'Launching analysis for reference dataset'
71
71
  # Initialize databases
72
72
  initialize_dbs! true
73
73
 
@@ -80,13 +80,13 @@ class MiGA::DistanceRunner
80
80
  end
81
81
 
82
82
  # Finalize
83
- [:haai, :aai, :ani].each { |m| checkpoint! m if db_counts[m] > 0 }
83
+ %i[haai aai ani].each { |m| checkpoint! m if db_counts[m] > 0 }
84
84
  end
85
85
 
86
86
  ##
87
87
  # Launch analysis for query datasets
88
88
  def go_query!
89
- $stderr.puts "Launching analysis for query dataset"
89
+ $stderr.puts 'Launching analysis for query dataset'
90
90
  # Check if project is ready
91
91
  tsk = ref_project.is_clade? ? [:subclades, :ani] : [:clade_finding, :aai]
92
92
  res = ref_project.result(tsk[0])
@@ -94,6 +94,7 @@ class MiGA::DistanceRunner
94
94
 
95
95
  # Initialize the databases
96
96
  initialize_dbs! false
97
+ distances_by_request(tsk[1])
97
98
  # Calculate the classification-informed AAI/ANI traverse
98
99
  results = File.expand_path("#{dataset.name}.#{tsk[1]}-medoids.tsv", home)
99
100
  fh = File.open(results, 'w')
@@ -111,7 +112,9 @@ class MiGA::DistanceRunner
111
112
  next unless r[1].to_i == val_cls
112
113
 
113
114
  ani = ani_after_aai(ref_project.dataset(r[0]), 80.0)
114
- closest = { ds: r[0], ani: ani } unless ani.nil? or ani < closest[:ani]
115
+ unless ani.nil? || ani < closest[:ani]
116
+ closest = { ds: r[0], ani: ani }
117
+ end
115
118
  end
116
119
  end
117
120
  end
@@ -133,7 +136,7 @@ class MiGA::DistanceRunner
133
136
 
134
137
  # Launch analysis for taxonomy jobs
135
138
  def go_taxonomy!
136
- $stderr.puts "Launching taxonomy analysis"
139
+ $stderr.puts 'Launching taxonomy analysis'
137
140
  return unless project.metadata[:ref_project]
138
141
 
139
142
  go_query! # <- yeah, it's actually the same, just different ref_project
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: miga-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.7.0
4
+ version: 0.7.11.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-06-04 00:00:00.000000000 Z
11
+ date: 2020-07-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: daemons
@@ -529,7 +529,7 @@ homepage: http://enve-omics.ce.gatech.edu/miga
529
529
  licenses:
530
530
  - Artistic-2.0
531
531
  metadata: {}
532
- post_install_message:
532
+ post_install_message:
533
533
  rdoc_options:
534
534
  - lib
535
535
  - README.md
@@ -550,8 +550,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
550
550
  - !ruby/object:Gem::Version
551
551
  version: '0'
552
552
  requirements: []
553
- rubygems_version: 3.0.3
554
- signing_key:
553
+ rubygems_version: 3.1.2
554
+ signing_key:
555
555
  specification_version: 4
556
556
  summary: MiGA
557
557
  test_files: []