miga-base 0.7.7.0 → 0.7.11.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3682f50e3efe936ce751cd83cc7945edddb8e1c3ea6e654c4d54f8ea79efbfcb
4
- data.tar.gz: a5bc821d8f1b6f55baf495eea28e8783c86c61ffaca2d486e4589b818a60038f
3
+ metadata.gz: 7b4a168130d732c670246cd4a874272e77e5f7d88fdef00e10d81ab8e5f9979a
4
+ data.tar.gz: '069e2dd280b4afecb67478612f1dee35bf2cada3ae57cbc61c6e70d0ef3bd233'
5
5
  SHA512:
6
- metadata.gz: 23e986949f97ae31498b7310eba666f0fc4b5f3e4ab9d38a135b2934db901449dca70ce74830e4353bb60f2196ce2c195b1bfb20400f884494e9766e58ea5214
7
- data.tar.gz: 3857008111b8a65b1fbf09442eb3a657789ebf964769c7805485be57298141c363296b7d2491ef3379726344f07892211d10d4c72fc74a8095bc0eaf00d4e873
6
+ metadata.gz: a37fd7d69339c7a63d5ac38e0c232fed96d479c3f2f2bc67b2ee956bb908d8690a55f21a6fa0185c05f209139e16b6b2ddcd6b0f36fac471f9e0b4fd2c4a5f04
7
+ data.tar.gz: cb156656c79f1a765281f163691650e32f90056dc767827d4b3fe958b4042e6359b810f5a3249c6902bce042f6a457507a027836797c39328889d9cbbbc5c5d0
data/README.md CHANGED
@@ -41,6 +41,7 @@ Developed and maintained by [Luis M. Rodriguez-R][lrr]. MiGA is the result of a
41
41
  collaboration between [Kostas Lab][kostas] at the Georgia Institute of
42
42
  Technology and [RDP][rdp] at Michigan State University.
43
43
 
44
+ See also the [complete list of contributors](manual/part1/contributors.md).
44
45
 
45
46
  # License
46
47
 
@@ -42,7 +42,9 @@ class MiGA::Cli::Action::ClassifyWf < MiGA::Cli::Action
42
42
  '--no-summaries',
43
43
  'Do not generate intermediate step summaries'
44
44
  ) { |v| cli[:summaries] = v }
45
- opts_for_wf(opt, 'Input genome assemblies (nucleotides, FastA)')
45
+ opts_for_wf(
46
+ opt, 'Input genome assemblies (nucleotides, FastA)', qual: false
47
+ )
46
48
  end
47
49
  end
48
50
 
@@ -19,6 +19,10 @@ class MiGA::Cli::Action::DerepWf < MiGA::Cli::Action
19
19
  'Use Average Amino Acid Identity (AAI) as genome similarity metric',
20
20
  'By default: Use Average Nucleotide Identity (ANI)'
21
21
  ) { cli[:metric] = :aai }
22
+ opt.on(
23
+ '--ani',
24
+ 'Use Average Nucleotide Identity (ANI) as similarity metric (default)'
25
+ ) { cli[:metric] = :ani }
22
26
  opt.on(
23
27
  '--threshold FLOAT', Float,
24
28
  "Metric threshold (%) to dereplicate. By default: #{cli[:threshold]}"
@@ -17,18 +17,21 @@ class MiGA::Cli::Action::Edit < MiGA::Cli::Action
17
17
  'Activate dataset; requires -D'
18
18
  ) { |v| cli[:activate] = v }
19
19
  opt.on(
20
- '--inactivate',
21
- 'Inactivate dataset; requires -D'
22
- ) { |v| cli[:activate] = !v }
20
+ '--inactivate [reason]',
21
+ 'Inactivate dataset; requires -D',
22
+ 'The argument is optional: reason to inactivate dataset'
23
+ ) { |v| cli[:activate] = false ; cli[:reason] = v }
23
24
  end
24
25
  end
25
26
 
26
27
  def perform
27
28
  obj = cli.load_project_or_dataset
28
29
  unless cli[:activate].nil?
29
- cli.ensure_par({ dataset: '-D' },
30
- '%<name>s is mandatory with --[in-]activate: please provide %<flag>s')
31
- cli[:activate] ? obj.activate! : obj.inactivate!
30
+ cli.ensure_par(
31
+ { dataset: '-D' },
32
+ '%<name>s is mandatory with --[in-]activate: please provide %<flag>s'
33
+ )
34
+ cli[:activate] ? obj.activate! : obj.inactivate!(cli[:reason])
32
35
  end
33
36
  cli.add_metadata(obj)
34
37
  obj.save
@@ -15,7 +15,10 @@ class MiGA::Cli::Action::QualityWf < MiGA::Cli::Action
15
15
  '-m', '--mytaxa-scan',
16
16
  'Perform MyTaxa scan analysis'
17
17
  ) { |v| cli[:mytaxa] = v }
18
- opts_for_wf(opt, 'Input genome assemblies (nucleotides, FastA)')
18
+ opts_for_wf(
19
+ opt, 'Input genome assemblies (nucleotides, FastA)',
20
+ qual: false
21
+ )
19
22
  end
20
23
  end
21
24
 
@@ -14,7 +14,7 @@ class MiGA::Cli::Action::Stats < MiGA::Cli::Action
14
14
  ) { |v| cli[:key] = v }
15
15
  opt.on(
16
16
  '--compute-and-save',
17
- 'Compute and saves the statistics'
17
+ 'Compute and save the statistics'
18
18
  ) { |v| cli[:compute] = v }
19
19
  opt.on(
20
20
  '--try-load',
@@ -38,7 +38,7 @@ class MiGA::Cli::Action::Stats < MiGA::Cli::Action
38
38
  end
39
39
  if cli[:key].nil?
40
40
  r[:stats].each do |k, v|
41
- k_n = k == :g_c_content ? 'G+C content' : k.to_s.unmiga_name.capitalize
41
+ k_n = k.to_s.unmiga_name.sub(/^./, &:upcase)
42
42
  cli.puts "#{k_n}: #{v.is_a?(Array) ? v.join(' ') : v}"
43
43
  end
44
44
  else
@@ -26,6 +26,10 @@ class MiGA::Cli::Action::Summary < MiGA::Cli::Action
26
26
  '--with-units',
27
27
  'Include units in each cell'
28
28
  ) { |v| cli[:units] = v }
29
+ opt.on(
30
+ '--compute-and-save',
31
+ 'Compute and save the statistics if not yet available'
32
+ ) { |v| cli[:compute] = v }
29
33
  end
30
34
  end
31
35
 
@@ -34,7 +38,8 @@ class MiGA::Cli::Action::Summary < MiGA::Cli::Action
34
38
  ds = cli.load_and_filter_datasets
35
39
  cli.say 'Loading results'
36
40
  stats = ds.map do |d|
37
- r = d.add_result(cli[:result].to_sym, false)
41
+ r = d.result(cli[:result])
42
+ r.compute_stats if cli[:compute] && !r.nil? && r[:stats].empty?
38
43
  s = r.nil? ? {} : r[:stats]
39
44
  s.tap { |i| i[:dataset] = d.name }
40
45
  end
@@ -15,7 +15,7 @@ module MiGA::Cli::Action::Wf
15
15
 
16
16
  def opts_for_wf(opt, files_desc, params = {})
17
17
  {
18
- multi: false, cleanup: true, project_type: false, ncbi: true
18
+ multi: false, cleanup: true, project_type: false, ncbi: true, qual: true
19
19
  }.each { |k, v| params[k] = v if params[k].nil? }
20
20
  opt.on(
21
21
  '-o', '--out_dir PATH',
@@ -40,6 +40,13 @@ module MiGA::Cli::Action::Wf
40
40
  'Only download complete genomes, not drafts'
41
41
  ) { |v| cli[:ncbi_draft] = v }
42
42
  end
43
+ if params[:qual]
44
+ opt.on(
45
+ '--min-qual FLOAT', Float,
46
+ 'Minimum genome quality to include in analysis',
47
+ 'By default: 50.0'
48
+ ) { |v| cli[:min_qual] = v }
49
+ end
43
50
  if params[:cleanup]
44
51
  opt.on(
45
52
  '-c', '--clean',
@@ -89,6 +96,10 @@ module MiGA::Cli::Action::Wf
89
96
  end
90
97
 
91
98
  def opts_for_wf_distances(opt)
99
+ opt.on('--sensitive', 'Alias to: --aai-p blast+ --ani-p blast+') do
100
+ cli[:aai_p] = 'blast+'
101
+ cli[:ani_p] = 'blast+'
102
+ end
92
103
  opt.on('--fast', 'Alias to: --aai-p diamond --ani-p fastani') do
93
104
  cli[:aai_p] = 'diamond'
94
105
  cli[:ani_p] = 'fastani'
@@ -121,7 +132,7 @@ module MiGA::Cli::Action::Wf
121
132
  ]) unless MiGA::Project.exist? cli[:outdir]
122
133
  # Define project metadata
123
134
  p = cli.load_project(:outdir, '-o')
124
- [:haai_p, :aai_p, :ani_p, :ess_coll].each { |i| p_metadata[i] = cli[i] }
135
+ %i[haai_p aai_p ani_p ess_coll min_qual].each { |i| p_metadata[i] = cli[i] }
125
136
  p_metadata[:type] = cli[:project_type]
126
137
  transfer_metadata(p, p_metadata)
127
138
  # Download datasets
@@ -155,7 +166,7 @@ module MiGA::Cli::Action::Wf
155
166
  '-P', cli[:outdir],
156
167
  '-r', r,
157
168
  '-o', File.expand_path("#{r}.tsv", cli[:outdir]),
158
- '--tab'
169
+ '--tab', '--ref', '--active'
159
170
  ])
160
171
  end
161
172
  end
@@ -68,15 +68,20 @@ module MiGA::Common::Format
68
68
  # a FastA or FastQ file (supports gzipped files). The +format+ must be a
69
69
  # Symbol, one of +:fasta+ or +:fastq+. Additional estimations can be
70
70
  # controlled via the +opts+ Hash. Supported options include:
71
- # - +:n50+: If true, it also returns the N50 and the median (in bp)
72
- # - +:gc+: If true, it also returns the G+C content (in %)
73
- # - +:x+: If true, it also returns the undetermined bases content (in %)
71
+ # - +:n50+: Include the N50 and the median (in bp)
72
+ # - +:gc+: Include the G+C content (in %)
73
+ # - +:x+: Include the undetermined bases content (in %)
74
+ # - +:skew+: Include G-C and A-T sequence skew (in %; forces gc: true).
75
+ # See definition used here in DOI:10.1177/117693430700300006
74
76
  def seqs_length(file, format, opts = {})
77
+ opts[:gc] = true if opts[:skew]
75
78
  fh = file =~ /\.gz/ ? Zlib::GzipReader.open(file) : File.open(file, 'r')
76
79
  l = []
77
80
  gc = 0
78
81
  xn = 0
79
- i = 0 # <- Zlib::GzipReader doesn't set `$.`
82
+ t = 0
83
+ c = 0
84
+ i = 0 # <- Zlib::GzipReader doesn't set `$.`
80
85
  fh.each_line do |ln|
81
86
  i += 1
82
87
  if (format == :fasta and ln =~ /^>/) or
@@ -86,16 +91,27 @@ module MiGA::Common::Format
86
91
  l[l.size - 1] += ln.chomp.size
87
92
  gc += ln.scan(/[GCgc]/).count if opts[:gc]
88
93
  xn += ln.scan(/[XNxn]/).count if opts[:x]
94
+ if opts[:skew]
95
+ t += ln.scan(/[Tt]/).count
96
+ c += ln.scan(/[Cc]/).count
97
+ end
89
98
  end
90
99
  end
91
100
  fh.close
92
101
 
93
- o = { n: l.size, tot: l.inject(:+), max: l.max }
102
+ o = { n: l.size, tot: l.inject(0, :+), max: l.max }
103
+ return o if o[:tot].zero?
94
104
  o[:avg] = o[:tot].to_f / l.size
95
105
  o[:var] = l.map { |a| a**2 }.inject(:+).to_f / l.size - o[:avg]**2
96
106
  o[:sd] = Math.sqrt o[:var]
97
107
  o[:gc] = 100.0 * gc / o[:tot] if opts[:gc]
98
108
  o[:x] = 100.0 * xn / o[:tot] if opts[:x]
109
+ if opts[:skew]
110
+ at = o[:tot] - gc
111
+ o[:at_skew] = 100.0 * (2 * t - at) / at
112
+ o[:gc_skew] = 100.0 * (2 * c - gc) / gc
113
+ end
114
+
99
115
  if opts[:n50]
100
116
  l.sort!
101
117
  thr = o[:tot] / 2
@@ -106,7 +122,8 @@ module MiGA::Common::Format
106
122
  break if pos >= thr
107
123
  end
108
124
  o[:med] = o[:n].even? ?
109
- 0.5 * l[o[:n] / 2 - 1, 2].inject(:+) : l[(o[:n] - 1) / 2]
125
+ 0.5 * l[o[:n] / 2 - 1, 2].inject(:+) :
126
+ l[(o[:n] - 1) / 2]
110
127
  end
111
128
  o
112
129
  end
@@ -130,9 +147,14 @@ class String
130
147
  end
131
148
 
132
149
  ##
133
- # Replace underscores by spaces or dots (depending on context).
150
+ # Replace underscores by spaces or other symbols depending on context
134
151
  def unmiga_name
135
- gsub(/_(str|sp|subsp|pv)__/, '_\\1._').tr('_', ' ')
152
+ gsub(/_(str|sp|subsp|pv)__/, '_\\1._')
153
+ .gsub(/g_c_(content)/, 'G+C \\1')
154
+ .gsub(/g_c_(skew)/, 'G-C \\1')
155
+ .gsub(/a_t_(skew)/, 'A-T \\1')
156
+ .gsub(/x_content/, &:capitalize)
157
+ .tr('_', ' ')
136
158
  end
137
159
 
138
160
  ##
@@ -72,6 +72,7 @@ class MiGA::Daemon < MiGA::MiGA
72
72
  say '-----------------------------------'
73
73
  say 'MiGA:%s launched' % project.name
74
74
  say '-----------------------------------'
75
+ recalculate_status!
75
76
  load_status
76
77
  say 'Configuration options:'
77
78
  say @runopts.to_s
@@ -99,6 +100,7 @@ class MiGA::Daemon < MiGA::MiGA
99
100
  end
100
101
 
101
102
  def recalculate_status!
103
+ say 'Recalculating status for all datasets'
102
104
  project.each_dataset(&:recalculate_status)
103
105
  end
104
106
 
@@ -158,8 +160,8 @@ class MiGA::Daemon < MiGA::MiGA
158
160
  end
159
161
 
160
162
  ##
161
- # Traverse datasets, and returns boolean indicating if at any datasets
162
- # are incomplete
163
+ # Traverse datasets, and returns boolean indicating if any reference
164
+ # datasets are incomplete
163
165
  def check_datasets
164
166
  l_say(2, 'Checking datasets')
165
167
  o = false
@@ -167,7 +169,7 @@ class MiGA::Daemon < MiGA::MiGA
167
169
  next unless ds.status == :incomplete
168
170
  next if ds.next_preprocessing(false).nil?
169
171
 
170
- o = true
172
+ o = true if ds.ref?
171
173
  queue_job(:d, ds)
172
174
  end
173
175
  o
@@ -183,7 +185,7 @@ class MiGA::Daemon < MiGA::MiGA
183
185
  return if project.dataset_names.empty?
184
186
 
185
187
  # Double-check if all datasets are ready
186
- return unless project.done_preprocessing?(false)
188
+ return unless project.done_preprocessing?
187
189
 
188
190
  # Queue project-level job
189
191
  to_run = project.next_task(nil, false)
@@ -97,7 +97,10 @@ class MiGA::Dataset < MiGA::MiGA
97
97
 
98
98
  ##
99
99
  # Inactivate a dataset. This halts automated processing by the daemon
100
- def inactivate!
100
+ #
101
+ # If given, the +reason+ string is saved as a metadata +:warn+ entry
102
+ def inactivate!(reason = nil)
103
+ metadata[:warn] = "Inactive: #{reason}" unless reason.nil?
101
104
  metadata[:inactive] = true
102
105
  metadata.save
103
106
  pull_hook :on_inactivate
@@ -107,6 +110,7 @@ class MiGA::Dataset < MiGA::MiGA
107
110
  # Activate a dataset. This removes the +:inactive+ flag
108
111
  def activate!
109
112
  metadata[:inactive] = nil
113
+ metadata[:warn] = nil if metadata[:warn] && metadata[:warn] =~ /^Inactive: /
110
114
  metadata.save
111
115
  pull_hook :on_activate
112
116
  end
@@ -35,8 +35,8 @@ module MiGA::Dataset::Base
35
35
  mytaxa: '07.annotation/02.taxonomy/01.mytaxa',
36
36
  mytaxa_scan: '07.annotation/03.qa/02.mytaxa_scan',
37
37
  # Distances (for single-species datasets)
38
- distances: '09.distances',
39
38
  taxonomy: '09.distances/05.taxonomy',
39
+ distances: '09.distances',
40
40
  # General statistics
41
41
  stats: '90.stats'
42
42
  }
@@ -52,15 +52,15 @@ module MiGA::Dataset::Hooks
52
52
  end
53
53
 
54
54
  ##
55
- # Run +cmd+ in the command-line with {{variables}}: dataset, project, miga,
56
- # object (as defined for the event, if any)
55
+ # Run +cmd+ in the command-line with {{variables}}:
56
+ # dataset, project, project_name, miga, object (if defined for the event)
57
57
  # - +hook_args+: +[cmd]+
58
58
  # - +event_args+: +[object (optional)]+
59
59
  def hook_run_cmd(hook_args, event_args)
60
60
  Process.wait(
61
61
  spawn hook_args.first.miga_variables(
62
- dataset: name, project: project.path, miga: MiGA::MiGA.root_path,
63
- object: event_args.first
62
+ dataset: name, project: project.path, project_name: project.name,
63
+ miga: MiGA::MiGA.root_path, object: event_args.first
64
64
  )
65
65
  )
66
66
  end
@@ -26,15 +26,24 @@ module MiGA::Dataset::Result
26
26
  # The values are symbols:
27
27
  # - empty: the dataset has no data
28
28
  # - inactive: the dataset is inactive
29
+ # - upstream: the task is upstream from dataset's input
29
30
  # - force: forced to ignore by metadata
30
31
  # - project: incompatible project
31
32
  # - noref: incompatible dataset, only for reference
32
33
  # - multi: incompatible dataset, only for multi
33
34
  # - nonmulti: incompatible dataset, only for nonmulti
35
+ # - complete: the task is already complete
34
36
  # - execute: do not ignore, execute the task
35
37
  def why_ignore(task)
36
- if !active?
38
+ if !get_result(task).nil?
39
+ :complete
40
+ elsif !active?
37
41
  :inactive
42
+ elsif first_preprocessing.nil?
43
+ :empty
44
+ elsif @@PREPROCESSING_TASKS.index(task) <
45
+ @@PREPROCESSING_TASKS.index(first_preprocessing)
46
+ :upstream
38
47
  elsif !metadata["run_#{task}"].nil?
39
48
  metadata["run_#{task}"] ? :execute : :force
40
49
  elsif task == :taxonomy && project.metadata[:ref_project].nil?
@@ -56,7 +65,7 @@ module MiGA::Dataset::Result
56
65
  # initial input. Passes +save+ to #add_result.
57
66
  def first_preprocessing(save = false)
58
67
  @first_processing ||= @@PREPROCESSING_TASKS.find do |t|
59
- !ignore_task?(t) && !add_result(t, save).nil?
68
+ !add_result(t, save).nil?
60
69
  end
61
70
  end
62
71
 
@@ -70,7 +79,7 @@ module MiGA::Dataset::Result
70
79
  false
71
80
  elsif add_result(t, save).nil?
72
81
  if (metadata["_try_#{t}"] || 0) > (project.metadata[:max_try] || 10)
73
- inactivate!
82
+ inactivate! "Too many errors in step #{t}"
74
83
  false
75
84
  else
76
85
  true
@@ -121,17 +130,12 @@ module MiGA::Dataset::Result
121
130
  # - complete: a task with registered results
122
131
  # - pending: a task queued to be performed
123
132
  def result_status(task)
124
- if first_preprocessing.nil?
125
- :ignore_empty
126
- elsif !get_result(task).nil?
127
- :complete
128
- elsif @@PREPROCESSING_TASKS.index(task) <
129
- @@PREPROCESSING_TASKS.index(first_preprocessing)
130
- :-
131
- elsif ignore_task?(task)
132
- :"ignore_#{why_ignore task}"
133
- else
134
- :pending
133
+ reason = why_ignore(task)
134
+ case reason
135
+ when :upstream; :-
136
+ when :execute; :pending
137
+ when :complete; :complete
138
+ else; :"ignore_#{reason}"
135
139
  end
136
140
  end
137
141
 
@@ -111,7 +111,7 @@ class MiGA::Lair < MiGA::MiGA
111
111
 
112
112
  yield(project)
113
113
  elsif Dir.exist? f
114
- each_project(f) { |project| yield(project) }
114
+ each_project(f) { |p| yield(p) }
115
115
  end
116
116
  end
117
117
  end
@@ -134,12 +134,10 @@ module MiGA::Project::Dataset
134
134
  ##
135
135
  # Are all the datasets in the project preprocessed? Save intermediate results
136
136
  # if +save+ (until the first incomplete dataset is reached).
137
- def done_preprocessing?(save = true)
138
- dataset_names.each do |dn|
139
- ds = dataset(dn)
140
- return false if ds.is_ref? and not ds.done_preprocessing?(save)
137
+ def done_preprocessing?(save = false)
138
+ !each_dataset.any? do |d|
139
+ d.ref? && d.active? && !d.done_preprocessing?(save)
141
140
  end
142
- true
143
141
  end
144
142
 
145
143
  ##
@@ -26,14 +26,15 @@ module MiGA::Project::Hooks
26
26
  end
27
27
 
28
28
  ##
29
- # Run +cmd+ in the command-line with {{variables}}: project, miga,
30
- # object (as defined by the event, if any)
29
+ # Run +cmd+ in the command-line with {{variables}}:
30
+ # project, project_name, miga, object (if defined by the event)
31
31
  # - +hook_args+: +[cmd]+
32
32
  # - +event_args+: +[object (optional)]+
33
33
  def hook_run_cmd(hook_args, event_args)
34
34
  Process.wait(
35
35
  spawn hook_args.first.miga_variables(
36
- project: path, miga: MiGA::MiGA.root_path, object: event_args.first
36
+ project: path, project_name: name,
37
+ miga: MiGA::MiGA.root_path, object: event_args.first
37
38
  )
38
39
  )
39
40
  end
@@ -94,12 +94,13 @@ class MiGA::RemoteDataset
94
94
  @timeout_try = 0
95
95
  begin
96
96
  DEBUG 'GET: ' + url
97
- open(url, read_timeout: 600) { |f| doc = f.read }
97
+ URI.parse(url).open(read_timeout: 600) { |f| doc = f.read }
98
98
  rescue => e
99
99
  @timeout_try += 1
100
100
  raise e if @timeout_try >= 3
101
101
 
102
102
  sleep 5 # <- For: 429 Too Many Requests
103
+ DEBUG "RETRYING after: #{e}"
103
104
  retry
104
105
  end
105
106
  doc
@@ -45,10 +45,6 @@ class MiGA::Result < MiGA::MiGA
45
45
  # Hash with the result metadata
46
46
  attr_reader :data
47
47
 
48
- ##
49
- # Array of MiGA::Result objects nested within the result (if any)
50
- attr_reader :results
51
-
52
48
  ##
53
49
  # Load or create the MiGA::Result described by the JSON file +path+
54
50
  def initialize(path)
@@ -78,9 +74,9 @@ class MiGA::Result < MiGA::MiGA
78
74
  when :json
79
75
  @path
80
76
  when :start
81
- @path.sub(/\.json$/, ".start")
77
+ @path.sub(/\.json$/, '.start')
82
78
  when :done
83
- @path.sub(/\.json$/, ".done")
79
+ @path.sub(/\.json$/, '.done')
84
80
  end
85
81
  end
86
82
 
@@ -134,7 +130,7 @@ class MiGA::Result < MiGA::MiGA
134
130
  ##
135
131
  # Initialize and #save empty result
136
132
  def create
137
- @data = { created: Time.now.to_s, results: [], stats: {}, files: {} }
133
+ @data = { created: Time.now.to_s, stats: {}, files: {} }
138
134
  save
139
135
  end
140
136
 
@@ -156,19 +152,20 @@ class MiGA::Result < MiGA::MiGA
156
152
  def load
157
153
  @data = MiGA::Json.parse(path)
158
154
  @data[:files] ||= {}
159
- @results = (self[:results] || []).map { |rs| MiGA::Result.new rs }
160
155
  end
161
156
 
162
157
  ##
163
158
  # Remove result, including all associated files
164
159
  def remove!
165
- each_file do |file|
166
- f = File.expand_path(file, dir)
167
- FileUtils.rm_rf(f)
168
- end
169
- %w(.start .done).each do |ext|
170
- f = path.sub(/\.json$/, ext)
171
- File.unlink f if File.exist? f
160
+ each_file { |file| FileUtils.rm_rf(File.join(dir, file)) }
161
+ unlink
162
+ end
163
+
164
+ # Unlink result by removing the .done and .start timestamps and the
165
+ # .json descriptor, but don't remove any other associated files
166
+ def unlink
167
+ %i(start done).each do |i|
168
+ f = path(i) and File.exists?(f) and File.unlink(f)
172
169
  end
173
170
  File.unlink path
174
171
  end
@@ -182,28 +179,19 @@ class MiGA::Result < MiGA::MiGA
182
179
  # Note that multiple files may have the same symbol (file_sym), since
183
180
  # arrays of files are supported.
184
181
  def each_file(&blk)
182
+ return to_enum(:each_file) unless block_given?
183
+
185
184
  @data[:files] ||= {}
186
185
  self[:files].each do |k, files|
187
186
  files = [files] unless files.kind_of? Array
188
187
  files.each do |file|
189
188
  case blk.arity
190
- when 1
191
- blk.call(file)
192
- when 2
193
- blk.call(k, file)
194
- when 3
195
- blk.call(k, file, File.expand_path(file, dir))
196
- else
197
- raise "Wrong number of arguments: #{blk.arity} for 1..3"
189
+ when 1; blk.call(file)
190
+ when 2; blk.call(k, file)
191
+ when 3; blk.call(k, file, File.expand_path(file, dir))
192
+ else; raise "Wrong number of arguments: #{blk.arity} for 1..3"
198
193
  end
199
194
  end
200
195
  end
201
196
  end
202
-
203
- ##
204
- # Add the MiGA::Result +result+ as part of the current result
205
- def add_result(result)
206
- @data[:results] << result.path
207
- save
208
- end
209
197
  end
@@ -8,6 +8,7 @@ module MiGA::Result::Stats
8
8
  # (Re-)calculate and save the statistics for the result
9
9
  def compute_stats
10
10
  method = :"compute_stats_#{key}"
11
+ MiGA::MiGA.DEBUG "Result(#{key}).compute_stats"
11
12
  stats = self.respond_to?(method, true) ? send(method) : nil
12
13
  unless stats.nil?
13
14
  self[:stats] = stats
@@ -20,28 +21,35 @@ module MiGA::Result::Stats
20
21
 
21
22
  def compute_stats_raw_reads
22
23
  stats = {}
24
+ seq_opts = { gc: true, x: true, skew: true }
23
25
  if self[:files][:pair1].nil?
24
- s = MiGA::MiGA.seqs_length(file_path(:single), :fastq, gc: true, x: true)
26
+ s = MiGA::MiGA.seqs_length(file_path(:single), :fastq, seq_opts)
25
27
  stats = {
26
28
  reads: s[:n],
27
29
  length_average: [s[:avg], 'bp'],
28
30
  length_standard_deviation: [s[:sd], 'bp'],
29
31
  g_c_content: [s[:gc], '%'],
30
- x_content: [s[:x], '%']
32
+ x_content: [s[:x], '%'],
33
+ g_c_skew: [s[:gc_skew], '%'],
34
+ a_t_skew: [s[:at_skew], '%']
31
35
  }
32
36
  else
33
- s1 = MiGA::MiGA.seqs_length(file_path(:pair1), :fastq, gc: true, x: true)
34
- s2 = MiGA::MiGA.seqs_length(file_path(:pair2), :fastq, gc: true, x: true)
37
+ s1 = MiGA::MiGA.seqs_length(file_path(:pair1), :fastq, seq_opts)
38
+ s2 = MiGA::MiGA.seqs_length(file_path(:pair2), :fastq, seq_opts)
35
39
  stats = {
36
40
  read_pairs: s1[:n],
37
41
  forward_length_average: [s1[:avg], 'bp'],
38
42
  forward_length_standard_deviation: [s1[:sd], 'bp'],
39
43
  forward_g_c_content: [s1[:gc], '%'],
40
44
  forward_x_content: [s1[:x], '%'],
45
+ forward_g_c_skew: [s1[:gc_skew], '%'],
46
+ forward_a_t_skew: [s1[:at_skew], '%'],
41
47
  reverse_length_average: [s2[:avg], 'bp'],
42
48
  reverse_length_standard_deviation: [s2[:sd], 'bp'],
43
49
  reverse_g_c_content: [s2[:gc], '%'],
44
- reverse_x_content: [s2[:x], '%']
50
+ reverse_x_content: [s2[:x], '%'],
51
+ reverse_g_c_skew: [s2[:gc_skew], '%'],
52
+ reverse_a_t_skew: [s2[:at_skew], '%']
45
53
  }
46
54
  end
47
55
  stats
@@ -49,19 +57,22 @@ module MiGA::Result::Stats
49
57
 
50
58
  def compute_stats_trimmed_fasta
51
59
  f = self[:files][:coupled].nil? ? file_path(:single) : file_path(:coupled)
52
- s = MiGA::MiGA.seqs_length(f, :fasta, gc: true, x: true)
60
+ s = MiGA::MiGA.seqs_length(f, :fasta, gc: true, x: true, skew: true)
53
61
  {
54
62
  reads: s[:n],
55
63
  length_average: [s[:avg], 'bp'],
56
64
  length_standard_deviation: [s[:sd], 'bp'],
57
65
  g_c_content: [s[:gc], '%'],
58
- x_content: [s[:x], '%']
66
+ x_content: [s[:x], '%'],
67
+ g_c_skew: [s[:gc_skew], '%'],
68
+ a_t_skew: [s[:at_skew], '%']
59
69
  }
60
70
  end
61
71
 
62
72
  def compute_stats_assembly
63
73
  s = MiGA::MiGA.seqs_length(
64
- file_path(:largecontigs), :fasta, n50: true, gc: true, x: true
74
+ file_path(:largecontigs), :fasta,
75
+ n50: true, gc: true, x: true, skew: true
65
76
  )
66
77
  {
67
78
  contigs: s[:n],
@@ -69,7 +80,9 @@ module MiGA::Result::Stats
69
80
  total_length: [s[:tot], 'bp'],
70
81
  longest_sequence: [s[:max], 'bp'],
71
82
  g_c_content: [s[:gc], '%'],
72
- x_content: [s[:x], '%']
83
+ x_content: [s[:x], '%'],
84
+ g_c_skew: [s[:gc_skew], '%'],
85
+ a_t_skew: [s[:at_skew], '%']
73
86
  }
74
87
  end
75
88
 
@@ -109,20 +122,8 @@ module MiGA::Result::Stats
109
122
  end
110
123
  end
111
124
  else
112
- # Fix estimate by domain
113
- if !(tax = source.metadata[:tax]).nil? &&
114
- %w[Archaea Bacteria].include?(tax[:d]) &&
115
- file_path(:raw_report).nil?
116
- scr = "#{MiGA::MiGA.root_path}/utils/domain-ess-genes.rb"
117
- rep = file_path(:report)
118
- rc_p = File.expand_path('.miga_rc', ENV['HOME'])
119
- rc = File.exist?(rc_p) ? ". '#{rc_p}' && " : ''
120
- $stderr.print `#{rc} ruby '#{scr}' \
121
- '#{rep}' '#{rep}.domain' '#{tax[:d][0]}'`
122
- add_file(:raw_report, "#{source.name}.ess/log")
123
- add_file(:report, "#{source.name}.ess/log.domain")
124
- end
125
- # Extract/compute quality values
125
+ # Estimate quality metrics
126
+ fix_essential_genes_by_domain
126
127
  stats = { completeness: [0.0, '%'], contamination: [0.0, '%'] }
127
128
  File.open(file_path(:report), 'r') do |fh|
128
129
  fh.each_line do |ln|
@@ -131,6 +132,8 @@ module MiGA::Result::Stats
131
132
  end
132
133
  end
133
134
  end
135
+
136
+ # Determine qualitative range
134
137
  stats[:quality] = stats[:completeness][0] - stats[:contamination][0] * 5
135
138
  source.metadata[:quality] =
136
139
  case stats[:quality]
@@ -140,6 +143,12 @@ module MiGA::Result::Stats
140
143
  else; :low
141
144
  end
142
145
  source.save
146
+
147
+ # Inactivate low-quality datasets
148
+ min_qual = (project.metadata[:min_qual] || 50)
149
+ if min_qual != 'no' && stats[:quality] < min_qual
150
+ source.inactivate! 'Low quality genome'
151
+ end
143
152
  end
144
153
  stats
145
154
  end
@@ -175,4 +184,21 @@ module MiGA::Result::Stats
175
184
  end
176
185
  stats
177
186
  end
187
+
188
+ # Fix estimates derived from essential genes according to taxonomic domain
189
+ def fix_essential_genes_by_domain
190
+ return if (tax = source.metadata[:tax]).nil? ||
191
+ !%w[Archaea Bacteria].include?(tax[:d]) ||
192
+ file_path(:raw_report)
193
+
194
+ MiGA::MiGA.DEBUG "Fixing essential genes by domain"
195
+ scr = "#{MiGA::MiGA.root_path}/utils/domain-ess-genes.rb"
196
+ rep = file_path(:report)
197
+ rc_p = File.expand_path('.miga_rc', ENV['HOME'])
198
+ rc = File.exist?(rc_p) ? ". '#{rc_p}' && " : ''
199
+ $stderr.print `#{rc} ruby '#{scr}' \
200
+ '#{rep}' '#{rep}.domain' '#{tax[:d][0]}'`
201
+ add_file(:raw_report, "#{source.name}.ess/log")
202
+ add_file(:report, "#{source.name}.ess/log.domain")
203
+ end
178
204
  end
@@ -8,7 +8,7 @@ module MiGA
8
8
  # - Float representing the major.minor version.
9
9
  # - Integer representing gem releases of the current version.
10
10
  # - Integer representing minor changes that require new version number.
11
- VERSION = [0.7, 7, 0]
11
+ VERSION = [0.7, 11, 0]
12
12
 
13
13
  ##
14
14
  # Nickname for the current major.minor version.
@@ -16,7 +16,7 @@ module MiGA
16
16
 
17
17
  ##
18
18
  # Date of the current gem release.
19
- VERSION_DATE = Date.new(2020, 6, 4)
19
+ VERSION_DATE = Date.new(2020, 7, 1)
20
20
 
21
21
  ##
22
22
  # Reference of MiGA.
@@ -9,7 +9,12 @@ cd "$PROJECT/data/09.distances"
9
9
  # Initialize
10
10
  miga date > "$DATASET.start"
11
11
 
12
- # Run
12
+ # Check quality first
13
+ miga stats -P "$PROJECT" -D "$DATASET" -r essential_genes --compute-and-save
14
+ inactive=$(miga ls -P "$PROJECT" -D "$DATASET" -m inactive | cut -f 2)
15
+ [[ "$inactive" == "true" ]] && exit
16
+
17
+ # Run distances
13
18
  ruby -I "$MIGA/lib" "$MIGA/utils/distances.rb" "$PROJECT" "$DATASET"
14
19
 
15
20
  # Finalize
@@ -93,7 +93,7 @@ class DaemonTest < Test::Unit::TestCase
93
93
  0 => /-{20}\n/,
94
94
  1 => /MiGA:#{p.name} launched/,
95
95
  2 => /-{20}\n/,
96
- 5 => /Probing running jobs\n/
96
+ 6 => /Probing running jobs\n/
97
97
  }.each { |k, v| assert_match(v, l[k], "unexpected line: #{k}") }
98
98
  ensure
99
99
  begin
@@ -185,11 +185,13 @@ class DatasetTest < Test::Unit::TestCase
185
185
  d = dataset
186
186
  assert_equal(:incomplete, d.status)
187
187
  assert_predicate(d, :active?)
188
- d.inactivate!
188
+ d.inactivate! 'Too annoying'
189
189
  assert_equal(:inactive, d.status)
190
+ assert_equal('Inactive: Too annoying', d.metadata[:warn])
190
191
  assert_not_predicate(d, :active?)
191
192
  d.activate!
192
193
  assert_equal(:incomplete, d.status)
194
+ assert_nil(d.metadata[:warn])
193
195
  assert_predicate(d, :active?)
194
196
  end
195
197
 
@@ -108,7 +108,7 @@ class ProjectTest < Test::Unit::TestCase
108
108
  d1 = p1.add_dataset('BAH')
109
109
  assert_not_predicate(p1, :done_preprocessing?)
110
110
  FileUtils.touch(File.join(p1.path, 'data', '90.stats', "#{d1.name}.done"))
111
- assert_predicate(p1, :done_preprocessing?)
111
+ assert { p1.done_preprocessing? true }
112
112
  assert_nil(p1.next_inclade)
113
113
  p1.metadata[:type] = :clade
114
114
  assert_equal(:subclades, p1.next_inclade)
@@ -101,7 +101,7 @@ class RemoteDatasetTest < Test::Unit::TestCase
101
101
 
102
102
  def test_ref_type_status
103
103
  declare_remote_access
104
- rd = MiGA::RemoteDataset.new('GCA_002849345', :assembly, :ncbi)
104
+ rd = MiGA::RemoteDataset.new('GCA_003144295.1', :assembly, :ncbi)
105
105
  assert { !rd.get_metadata[:is_type] }
106
106
  assert { rd.get_metadata[:is_ref_type] }
107
107
  end
@@ -29,6 +29,16 @@ module MiGA::DistanceRunner::Pipeline
29
29
  classify(clades, classif, metric, result_fh, val_cls)
30
30
  end
31
31
 
32
+ # Run distances against datasets listed in metadata's +:dist_req+
33
+ def distances_by_request(metric)
34
+ return unless dataset.metadata[:dist_req]
35
+
36
+ $stderr.puts 'Running distances by request'
37
+ dataset.metadata[:dist_req].each do |target|
38
+ ds = ref_project.dataset(target) and send(metric, ds)
39
+ end
40
+ end
41
+
32
42
  # Builds a tree with all visited medoids from any classification level
33
43
  def build_medoids_tree(metric)
34
44
  $stderr.puts "Building medoids tree (metric = #{metric})"
@@ -99,7 +109,7 @@ module MiGA::DistanceRunner::Pipeline
99
109
 
100
110
  # Transfer the taxonomy to the current dataset
101
111
  def transfer_taxonomy(tax)
102
- $stderr.puts "Transferring taxonomy"
112
+ $stderr.puts 'Transferring taxonomy'
103
113
  return if tax.nil?
104
114
 
105
115
  pval = (project.metadata[:tax_pvalue] || 0.05).to_f
@@ -67,7 +67,7 @@ class MiGA::DistanceRunner
67
67
 
68
68
  # Launch analysis for reference datasets
69
69
  def go_ref!
70
- $stderr.puts "Launching analysis for reference dataset"
70
+ $stderr.puts 'Launching analysis for reference dataset'
71
71
  # Initialize databases
72
72
  initialize_dbs! true
73
73
 
@@ -80,13 +80,13 @@ class MiGA::DistanceRunner
80
80
  end
81
81
 
82
82
  # Finalize
83
- [:haai, :aai, :ani].each { |m| checkpoint! m if db_counts[m] > 0 }
83
+ %i[haai aai ani].each { |m| checkpoint! m if db_counts[m] > 0 }
84
84
  end
85
85
 
86
86
  ##
87
87
  # Launch analysis for query datasets
88
88
  def go_query!
89
- $stderr.puts "Launching analysis for query dataset"
89
+ $stderr.puts 'Launching analysis for query dataset'
90
90
  # Check if project is ready
91
91
  tsk = ref_project.is_clade? ? [:subclades, :ani] : [:clade_finding, :aai]
92
92
  res = ref_project.result(tsk[0])
@@ -94,6 +94,7 @@ class MiGA::DistanceRunner
94
94
 
95
95
  # Initialize the databases
96
96
  initialize_dbs! false
97
+ distances_by_request(tsk[1])
97
98
  # Calculate the classification-informed AAI/ANI traverse
98
99
  results = File.expand_path("#{dataset.name}.#{tsk[1]}-medoids.tsv", home)
99
100
  fh = File.open(results, 'w')
@@ -111,7 +112,9 @@ class MiGA::DistanceRunner
111
112
  next unless r[1].to_i == val_cls
112
113
 
113
114
  ani = ani_after_aai(ref_project.dataset(r[0]), 80.0)
114
- closest = { ds: r[0], ani: ani } unless ani.nil? or ani < closest[:ani]
115
+ unless ani.nil? || ani < closest[:ani]
116
+ closest = { ds: r[0], ani: ani }
117
+ end
115
118
  end
116
119
  end
117
120
  end
@@ -133,7 +136,7 @@ class MiGA::DistanceRunner
133
136
 
134
137
  # Launch analysis for taxonomy jobs
135
138
  def go_taxonomy!
136
- $stderr.puts "Launching taxonomy analysis"
139
+ $stderr.puts 'Launching taxonomy analysis'
137
140
  return unless project.metadata[:ref_project]
138
141
 
139
142
  go_query! # <- yeah, it's actually the same, just different ref_project
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: miga-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.7.0
4
+ version: 0.7.11.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-06-04 00:00:00.000000000 Z
11
+ date: 2020-07-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: daemons
@@ -529,7 +529,7 @@ homepage: http://enve-omics.ce.gatech.edu/miga
529
529
  licenses:
530
530
  - Artistic-2.0
531
531
  metadata: {}
532
- post_install_message:
532
+ post_install_message:
533
533
  rdoc_options:
534
534
  - lib
535
535
  - README.md
@@ -550,8 +550,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
550
550
  - !ruby/object:Gem::Version
551
551
  version: '0'
552
552
  requirements: []
553
- rubygems_version: 3.0.3
554
- signing_key:
553
+ rubygems_version: 3.1.2
554
+ signing_key:
555
555
  specification_version: 4
556
556
  summary: MiGA
557
557
  test_files: []