miga-base 1.0.5.5 → 1.1.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 60c02b77c4b06c76c050a87a0506334a469f95684c4032a0d1a4e8bc69fec387
4
- data.tar.gz: 2464be0080e24e5c61067f6810dabd78f09ca0a7f13dafa1e049dd7b7d531233
3
+ metadata.gz: 4076b3b3a4a4143ac9100ce4d58fada7615f68ad3e6174445510655f62904867
4
+ data.tar.gz: '0975a5feb4c9eb71a474be87dd14b58297ef1aa7bd8612c20f1ce65febbdf980'
5
5
  SHA512:
6
- metadata.gz: 7a6e6e46fe291be06d7f9a2b91904b76170b7251861630227bbbbdf17c24a8cc3fa3b97bab83aaa98a2c46d994ddfc6707cc922e3f18b17424d254f09773c76b
7
- data.tar.gz: f8f6fa75ff6f58562f9869122e43de7fa091925cfab4e2606011b1afa2fee0439f64ef87188ba55d62f0e98e4afc5c727b85b053aa2424cbe602cb4bdda1cc04
6
+ metadata.gz: ebcb7fe28d415ca9709433975585518eb1ecd8e8270c584b6579da222e4d3733cc20d810787c3f764f6a6136e1a6f09b7cb6b1c00114c3ea9c0885370654f3a7
7
+ data.tar.gz: '082bd856ed21487e5de709e2067f1d3453f824e0ece7a77716c6fbe70d88a16c4d295196d5c6133e5667142e25f55f7e48e4a785afd160a14e8195a9b7efa6c2'
@@ -46,7 +46,7 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
46
46
  dist: ['distances', 'Check distance summary tables'],
47
47
  files: ['files', 'Check for outdated files'],
48
48
  cds: ['cds', 'Check for gzipped genes and proteins'],
49
- ess: ['essential-genes', 'Check for unarchived essential genes'],
49
+ ess: ['essential-genes', 'Check for outdated essential genes'],
50
50
  mts: ['mytaxa-scan', 'Check for unarchived MyTaxa scan'],
51
51
  start: ['start', 'Check for lingering .start files'],
52
52
  tax: ['taxonomy', 'Check for taxonomy consistency (not yet implemented)']
@@ -252,16 +252,16 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
252
252
  ##
253
253
  # Perform essential-genes operation with MiGA::Cli +cli+
254
254
  def check_ess(cli)
255
- cli.say 'Looking for unarchived essential genes'
255
+ cli.say 'Looking for outdated essential genes'
256
256
  cli.load_project.each_dataset do |d|
257
257
  res = d.result(:essential_genes)
258
258
  next if res.nil?
259
259
 
260
260
  dir = res.file_path(:collection)
261
- if dir.nil?
261
+ if dir.nil? || outdated_fastaai_ess(res)
262
262
  cli.say " > Removing #{d.name}:essential_genes"
263
263
  res.remove!
264
- sr = d.result(:stats) and sr.remove!
264
+ d.result(:stats)&.remove!
265
265
  next
266
266
  end
267
267
  next if Dir["#{dir}/*.faa"].empty?
@@ -272,6 +272,14 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
272
272
  end
273
273
  end
274
274
 
275
+ ##
276
+ # Check if the essential genes result +res+ has an outdated FastAAI index
277
+ def outdated_fastaai_ess(res)
278
+ idx1 = res.file_path(:fastaai_index)
279
+ idx2 = res.file_path(:fastaai_index_2)
280
+ idx2.nil? && !idx1.nil?
281
+ end
282
+
275
283
  ##
276
284
  # Perform mytaxa-scan operation with MiGA::Cli +cli+
277
285
  def check_mts(cli)
@@ -15,7 +15,7 @@ class MiGA::Cli::Action::Env < MiGA::Cli::Action
15
15
  . "$MIGA_HOME/.miga_rc"
16
16
  # Ensure MiGA & submodules are first in PATH
17
17
  export PATH="$MIGA/bin:$PATH"
18
- for util in enveomics/Scripts FastAAI/FastAAI multitrim ; do
18
+ for util in enveomics/Scripts FastAAI/FastAAI FastAAI multitrim ; do
19
19
  export PATH="$MIGA/utils/$util:$PATH"
20
20
  done
21
21
  BASH
@@ -181,7 +181,7 @@ class MiGA::Cli::Action::Init < MiGA::Cli::Action
181
181
  req_libraries = {
182
182
  r: %w[ape cluster vegan],
183
183
  ruby: %w[sqlite3 daemons json],
184
- python: %w[numpy]
184
+ python: %w[numpy sqlite3]
185
185
  }
186
186
 
187
187
  req_libraries.each do |language, libraries|
@@ -0,0 +1,230 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'miga/remote_dataset'
4
+ require 'csv'
5
+
6
+ ##
7
+ # Helper module including download functions for the ncbi_get action
8
+ module MiGA::Cli::Action::NcbiGet::Downloads
9
+ def cli_task_flags(opt)
10
+ cli.opt_flag(
11
+ opt, 'reference',
12
+ 'Download all reference genomes (ignore any other status)'
13
+ )
14
+ cli.opt_flag(opt, 'complete', 'Download complete genomes')
15
+ cli.opt_flag(opt, 'chromosome', 'Download complete chromosomes')
16
+ cli.opt_flag(opt, 'scaffold', 'Download genomes in scaffolds')
17
+ cli.opt_flag(opt, 'contig', 'Download genomes in contigs')
18
+ opt.on(
19
+ '--all',
20
+ 'Download all genomes (in any status)'
21
+ ) do
22
+ cli[:complete] = true
23
+ cli[:chromosome] = true
24
+ cli[:scaffold] = true
25
+ cli[:contig] = true
26
+ end
27
+ end
28
+
29
+ def cli_name_modifiers(opt)
30
+ opt.on(
31
+ '--no-version-name',
32
+ 'Do not add sequence version to the dataset name',
33
+ 'Only affects --complete and --chromosome'
34
+ ) { |v| cli[:add_version] = v }
35
+ cli.opt_flag(
36
+ opt, 'legacy-name',
37
+ 'Use dataset names based on chromosome entries instead of assembly',
38
+ :legacy_name
39
+ )
40
+ end
41
+
42
+ def cli_filters(opt)
43
+ opt.on(
44
+ '--blacklist PATH',
45
+ 'A file with dataset names to blacklist'
46
+ ) { |v| cli[:blacklist] = v }
47
+ cli.opt_flag(opt, 'dry', 'Do not download or save the datasets')
48
+ opt.on(
49
+ '--ignore-until STRING',
50
+ 'Ignores all datasets until a name is found (useful for large reruns)'
51
+ ) { |v| cli[:ignore_until] = v }
52
+ cli.opt_flag(
53
+ opt, 'get-metadata',
54
+ 'Only download and update metadata for existing datasets', :get_md
55
+ )
56
+ end
57
+
58
+ def cli_save_actions(opt)
59
+ cli.opt_flag(
60
+ opt, 'only-metadata',
61
+ 'Create datasets without input data but retrieve all metadata',
62
+ :only_md
63
+ )
64
+ opt.on(
65
+ '--save-every INT', Integer,
66
+ 'Save project every this many downloaded datasets',
67
+ 'If zero, it saves the project only once upon completion',
68
+ "By default: #{cli[:save_every]}"
69
+ ) { |v| cli[:save_every] = v }
70
+ opt.on(
71
+ '-q', '--query',
72
+ 'Register the datasets as queries, not reference datasets'
73
+ ) { |v| cli[:query] = v }
74
+ opt.on(
75
+ '-u', '--unlink',
76
+ 'Unlink all datasets in the project missing from the download list'
77
+ ) { |v| cli[:unlink] = v }
78
+ opt.on(
79
+ '-R', '--remote-list PATH',
80
+ 'Path to an output file with the list of all datasets listed remotely'
81
+ ) { |v| cli[:remote_list] = v }
82
+ end
83
+
84
+ def sanitize_cli
85
+ cli.ensure_par(taxon: '-T')
86
+ tasks = %w[reference complete chromosome scaffold contig]
87
+ unless tasks.any? { |i| cli[i.to_sym] }
88
+ raise 'No action requested: pick at least one type of genome'
89
+ end
90
+
91
+ cli[:save_every] = 1 if cli[:dry]
92
+ end
93
+
94
+ def remote_list
95
+ cli.say 'Downloading genome list'
96
+ ds = {}
97
+ url = remote_list_url
98
+ doc = MiGA::RemoteDataset.download_url(url)
99
+ CSV.parse(doc, headers: true).each do |r|
100
+ asm = r['assembly']
101
+ next if asm.nil? || asm.empty? || asm == '-'
102
+ next unless r['ftp_path_genbank']
103
+
104
+ rep = remote_row_replicons(r)
105
+ n = remote_row_name(r, rep, asm)
106
+
107
+ # Register for download
108
+ fna_url = '%s/%s_genomic.fna.gz' %
109
+ [r['ftp_path_genbank'], File.basename(r['ftp_path_genbank'])]
110
+ ds[n] = {
111
+ ids: [fna_url], db: :assembly_gz, universe: :web,
112
+ md: {
113
+ type: :genome, ncbi_asm: asm, strain: r['strain']
114
+ }
115
+ }
116
+ ds[n][:md][:ncbi_nuccore] = rep.join(',') unless rep.nil?
117
+ unless r['release_date'].nil?
118
+ ds[n][:md][:release_date] = Time.parse(r['release_date']).to_s
119
+ end
120
+ end
121
+ ds
122
+ end
123
+
124
+ def remote_row_replicons(r)
125
+ return if r['replicons'].nil?
126
+
127
+ r['replicons']
128
+ .split('; ')
129
+ .map { |i| i.gsub(/.*:/, '') }
130
+ .map { |i| i.gsub(%r{/.*}, '') }
131
+ end
132
+
133
+ def remote_row_name(r, rep, asm)
134
+ return r['#organism'].miga_name if cli[:legacy_name] && cli[:reference]
135
+
136
+ if cli[:legacy_name] && ['Complete', ' Chromosome'].include?(r['level'])
137
+ acc = rep.nil? ? '' : rep.first
138
+ else
139
+ acc = asm
140
+ end
141
+ acc.gsub!(/\.\d+\Z/, '') unless cli[:add_version]
142
+ "#{r['#organism']}_#{acc}".miga_name
143
+ end
144
+
145
+ def remote_list_url
146
+ url_base = 'https://www.ncbi.nlm.nih.gov/genomes/solr2txt.cgi?'
147
+ url_param = {
148
+ q: '[display()].' \
149
+ 'from(GenomeAssemblies).' \
150
+ 'usingschema(/schema/GenomeAssemblies).' \
151
+ 'matching(tab==["Prokaryotes"] and q=="' \
152
+ "#{cli[:taxon]&.tr('"', "'")}\"",
153
+ fields: 'organism|organism,assembly|assembly,replicons|replicons,' \
154
+ 'level|level,ftp_path_genbank|ftp_path_genbank,' \
155
+ 'release_date|release_date,strain|strain',
156
+ nolimit: 'on'
157
+ }
158
+ if cli[:reference]
159
+ url_param[:q] += ' and refseq_category==["representative"]'
160
+ else
161
+ status = {
162
+ complete: 'Complete',
163
+ chromosome: ' Chromosome', # <- The leading space is *VERY* important!
164
+ scaffold: 'Scaffold',
165
+ contig: 'Contig'
166
+ }.map { |k, v| '"' + v + '"' if cli[k] }.compact.join(',')
167
+ url_param[:q] += ' and level==[' + status + ']'
168
+ end
169
+ url_param[:q] += ')'
170
+ url_base + URI.encode_www_form(url_param)
171
+ end
172
+
173
+ def discard_blacklisted(ds)
174
+ unless cli[:blacklist].nil?
175
+ cli.say "Discarding datasets in #{cli[:blacklist]}"
176
+ File.readlines(cli[:blacklist])
177
+ .select { |i| i !~ /^#/ }
178
+ .map(&:chomp)
179
+ .each { |i| ds.delete i }
180
+ end
181
+ ds
182
+ end
183
+
184
+ def impose_limit(ds)
185
+ max = cli[:max_datasets].to_i
186
+ if !max.zero? && max < ds.size
187
+ cli.say "Subsampling list from #{ds.size} to #{max} datasets"
188
+ sample = ds.keys.sample(max)
189
+ ds.select! { |k, _| sample.include? k }
190
+ end
191
+ ds
192
+ end
193
+
194
+ def download_entries(ds, p)
195
+ cli.say "Downloading #{ds.size} " + (ds.size == 1 ? 'entry' : 'entries')
196
+ p.do_not_save = true if cli[:save_every] != 1
197
+ ignore = !cli[:ignore_until].nil?
198
+ downloaded = 0
199
+ d = []
200
+ ds.each do |name, body|
201
+ d << name
202
+ cli.puts name
203
+ ignore = false if ignore && name == cli[:ignore_until]
204
+ next if ignore || p.dataset(name).nil? == cli[:get_md]
205
+
206
+ downloaded += 1
207
+ unless cli[:dry]
208
+ save_entry(name, body, p)
209
+ p.save! if cli[:save_every] > 1 && (downloaded % cli[:save_every]).zero?
210
+ end
211
+ end
212
+ p.do_not_save = false
213
+ p.save! if cli[:save_every] != 1
214
+ [d, downloaded]
215
+ end
216
+
217
+ def save_entry(name, body, p)
218
+ cli.say ' Locating remote dataset'
219
+ body[:md][:metadata_only] = true if cli[:only_md]
220
+ rd = MiGA::RemoteDataset.new(body[:ids], body[:db], body[:universe])
221
+ if cli[:get_md]
222
+ cli.say ' Updating dataset'
223
+ rd.update_metadata(p.dataset(name), body[:md])
224
+ else
225
+ cli.say ' Creating dataset'
226
+ rd.save_to(p, name, !cli[:query], body[:md])
227
+ cli.add_metadata(p.add_dataset(name))
228
+ end
229
+ end
230
+ end
@@ -1,11 +1,11 @@
1
- # @package MiGA
2
- # @license Artistic-2.0
1
+ # frozen_string_literal: true
3
2
 
4
3
  require 'miga/cli/action'
5
- require 'miga/remote_dataset'
6
- require 'csv'
7
4
 
8
5
  class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action
6
+ require 'miga/cli/action/ncbi_get/downloads'
7
+ include MiGA::Cli::Action::NcbiGet::Downloads
8
+
9
9
  def parse_cli
10
10
  cli.defaults = {
11
11
  query: false, unlink: false,
@@ -20,6 +20,10 @@ class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action
20
20
  '-T', '--taxon STRING',
21
21
  '(Mandatory) Taxon name (e.g., a species binomial)'
22
22
  ) { |v| cli[:taxon] = v }
23
+ opt.on(
24
+ '--max INT', Integer,
25
+ 'Maximum number of datasets to download (by default: unlimited)'
26
+ ) { |v| cli[:max_datasets] = v }
23
27
  opt.on(
24
28
  '-m', '--metadata STRING',
25
29
  'Metadata as key-value pairs separated by = and delimited by comma',
@@ -41,6 +45,7 @@ class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action
41
45
  p = cli.load_project
42
46
  ds = remote_list
43
47
  ds = discard_blacklisted(ds)
48
+ ds = impose_limit(ds)
44
49
  d, downloaded = download_entries(ds, p)
45
50
 
46
51
  # Finalize
@@ -59,217 +64,4 @@ class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action
59
64
  cli.say "Datasets unlinked: #{unlink.size}"
60
65
  end
61
66
 
62
- private
63
-
64
- def cli_task_flags(opt)
65
- cli.opt_flag(
66
- opt, 'reference',
67
- 'Download all reference genomes (ignore any other status)'
68
- )
69
- cli.opt_flag(opt, 'complete', 'Download complete genomes')
70
- cli.opt_flag(opt, 'chromosome', 'Download complete chromosomes')
71
- cli.opt_flag(opt, 'scaffold', 'Download genomes in scaffolds')
72
- cli.opt_flag(opt, 'contig', 'Download genomes in contigs')
73
- opt.on(
74
- '--all',
75
- 'Download all genomes (in any status)'
76
- ) do
77
- cli[:complete] = true
78
- cli[:chromosome] = true
79
- cli[:scaffold] = true
80
- cli[:contig] = true
81
- end
82
- end
83
-
84
- def cli_name_modifiers(opt)
85
- opt.on(
86
- '--no-version-name',
87
- 'Do not add sequence version to the dataset name',
88
- 'Only affects --complete and --chromosome'
89
- ) { |v| cli[:add_version] = v }
90
- cli.opt_flag(
91
- opt, 'legacy-name',
92
- 'Use dataset names based on chromosome entries instead of assembly',
93
- :legacy_name
94
- )
95
- end
96
-
97
- def cli_filters(opt)
98
- opt.on(
99
- '--blacklist PATH',
100
- 'A file with dataset names to blacklist'
101
- ) { |v| cli[:blacklist] = v }
102
- cli.opt_flag(opt, 'dry', 'Do not download or save the datasets')
103
- opt.on(
104
- '--ignore-until STRING',
105
- 'Ignores all datasets until a name is found (useful for large reruns)'
106
- ) { |v| cli[:ignore_until] = v }
107
- cli.opt_flag(
108
- opt, 'get-metadata',
109
- 'Only download and update metadata for existing datasets', :get_md
110
- )
111
- end
112
-
113
- def cli_save_actions(opt)
114
- cli.opt_flag(
115
- opt, 'only-metadata',
116
- 'Create datasets without input data but retrieve all metadata',
117
- :only_md
118
- )
119
- opt.on(
120
- '--save-every INT', Integer,
121
- 'Save project every this many downloaded datasets',
122
- 'If zero, it saves the project only once upon completion',
123
- "By default: #{cli[:save_every]}"
124
- ) { |v| cli[:save_every] = v }
125
- opt.on(
126
- '-q', '--query',
127
- 'Register the datasets as queries, not reference datasets'
128
- ) { |v| cli[:query] = v }
129
- opt.on(
130
- '-u', '--unlink',
131
- 'Unlink all datasets in the project missing from the download list'
132
- ) { |v| cli[:unlink] = v }
133
- opt.on(
134
- '-R', '--remote-list PATH',
135
- 'Path to an output file with the list of all datasets listed remotely'
136
- ) { |v| cli[:remote_list] = v }
137
- end
138
-
139
- def sanitize_cli
140
- cli.ensure_par(taxon: '-T')
141
- tasks = %w[reference complete chromosome scaffold contig]
142
- unless tasks.any? { |i| cli[i.to_sym] }
143
- raise 'No action requested: pick at least one type of genome'
144
- end
145
-
146
- cli[:save_every] = 1 if cli[:dry]
147
- end
148
-
149
- def remote_list
150
- cli.say 'Downloading genome list'
151
- ds = {}
152
- url = remote_list_url
153
- doc = RemoteDataset.download_url(url)
154
- CSV.parse(doc, headers: true).each do |r|
155
- asm = r['assembly']
156
- next if asm.nil? || asm.empty? || asm == '-'
157
- next unless r['ftp_path_genbank']
158
-
159
- rep = remote_row_replicons(r)
160
- n = remote_row_name(r, rep, asm)
161
-
162
- # Register for download
163
- fna_url = '%s/%s_genomic.fna.gz' %
164
- [r['ftp_path_genbank'], File.basename(r['ftp_path_genbank'])]
165
- ds[n] = {
166
- ids: [fna_url], db: :assembly_gz, universe: :web,
167
- md: {
168
- type: :genome, ncbi_asm: asm, strain: r['strain']
169
- }
170
- }
171
- ds[n][:md][:ncbi_nuccore] = rep.join(',') unless rep.nil?
172
- unless r['release_date'].nil?
173
- ds[n][:md][:release_date] = Time.parse(r['release_date']).to_s
174
- end
175
- end
176
- ds
177
- end
178
-
179
- def remote_row_replicons(r)
180
- return if r['replicons'].nil?
181
-
182
- r['replicons']
183
- .split('; ')
184
- .map { |i| i.gsub(/.*:/, '') }
185
- .map { |i| i.gsub(%r{/.*}, '') }
186
- end
187
-
188
- def remote_row_name(r, rep, asm)
189
- return r['#organism'].miga_name if cli[:legacy_name] && cli[:reference]
190
-
191
- if cli[:legacy_name] && ['Complete', ' Chromosome'].include?(r['level'])
192
- acc = rep.nil? ? '' : rep.first
193
- else
194
- acc = asm
195
- end
196
- acc.gsub!(/\.\d+\Z/, '') unless cli[:add_version]
197
- "#{r['#organism']}_#{acc}".miga_name
198
- end
199
-
200
- def remote_list_url
201
- url_base = 'https://www.ncbi.nlm.nih.gov/genomes/solr2txt.cgi?'
202
- url_param = {
203
- q: '[display()].' \
204
- 'from(GenomeAssemblies).' \
205
- 'usingschema(/schema/GenomeAssemblies).' \
206
- 'matching(tab==["Prokaryotes"] and q=="' \
207
- "#{cli[:taxon]&.tr('"', "'")}\"",
208
- fields: 'organism|organism,assembly|assembly,replicons|replicons,' \
209
- 'level|level,ftp_path_genbank|ftp_path_genbank,' \
210
- 'release_date|release_date,strain|strain',
211
- nolimit: 'on'
212
- }
213
- if cli[:reference]
214
- url_param[:q] += ' and refseq_category==["representative"]'
215
- else
216
- status = {
217
- complete: 'Complete',
218
- chromosome: ' Chromosome', # <- The leading space is *VERY* important!
219
- scaffold: 'Scaffold',
220
- contig: 'Contig'
221
- }.map { |k, v| '"' + v + '"' if cli[k] }.compact.join(',')
222
- url_param[:q] += ' and level==[' + status + ']'
223
- end
224
- url_param[:q] += ')'
225
- url_base + URI.encode_www_form(url_param)
226
- end
227
-
228
- def discard_blacklisted(ds)
229
- unless cli[:blacklist].nil?
230
- cli.say "Discarding datasets in #{cli[:blacklist]}"
231
- File.readlines(cli[:blacklist])
232
- .select { |i| i !~ /^#/ }
233
- .map(&:chomp)
234
- .each { |i| ds.delete i }
235
- end
236
- ds
237
- end
238
-
239
- def download_entries(ds, p)
240
- cli.say "Downloading #{ds.size} " + (ds.size == 1 ? 'entry' : 'entries')
241
- p.do_not_save = true if cli[:save_every] != 1
242
- ignore = !cli[:ignore_until].nil?
243
- downloaded = 0
244
- d = []
245
- ds.each do |name, body|
246
- d << name
247
- cli.puts name
248
- ignore = false if ignore && name == cli[:ignore_until]
249
- next if ignore || p.dataset(name).nil? == cli[:get_md]
250
-
251
- downloaded += 1
252
- unless cli[:dry]
253
- save_entry(name, body, p)
254
- p.save! if cli[:save_every] > 1 && (downloaded % cli[:save_every]).zero?
255
- end
256
- end
257
- p.do_not_save = false
258
- p.save! if cli[:save_every] != 1
259
- [d, downloaded]
260
- end
261
-
262
- def save_entry(name, body, p)
263
- cli.say ' Locating remote dataset'
264
- body[:md][:metadata_only] = true if cli[:only_md]
265
- rd = RemoteDataset.new(body[:ids], body[:db], body[:universe])
266
- if cli[:get_md]
267
- cli.say ' Updating dataset'
268
- rd.update_metadata(p.dataset(name), body[:md])
269
- else
270
- cli.say ' Creating dataset'
271
- rd.save_to(p, name, !cli[:query], body[:md])
272
- cli.add_metadata(p.add_dataset(name))
273
- end
274
- end
275
67
  end
@@ -38,6 +38,10 @@ module MiGA::Cli::Action::Wf
38
38
  '--no-draft',
39
39
  'Only download complete genomes, not drafts'
40
40
  ) { |v| cli[:ncbi_draft] = v }
41
+ opt.on(
42
+ '--max-download INT', Integer,
43
+ 'Maximum number of genomes to download (by default: unlimited)'
44
+ ) { |v| cli[:ncbi_max] = v }
41
45
  end
42
46
  if params[:qual]
43
47
  opt.on(
@@ -125,9 +129,9 @@ module MiGA::Cli::Action::Wf
125
129
  # Download datasets
126
130
  unless cli[:ncbi_taxon].nil?
127
131
  what = cli[:ncbi_draft] ? '--all' : '--complete'
128
- call_cli(
129
- ['ncbi_get', '-P', cli[:outdir], '-T', cli[:ncbi_taxon], what]
130
- )
132
+ cmd = ['ncbi_get', '-P', cli[:outdir], '-T', cli[:ncbi_taxon], what]
133
+ cmd += ['--max', cli[:ncbi_max]] if cli[:ncbi_max]
134
+ call_cli(cmd)
131
135
  end
132
136
 
133
137
  # Add datasets
data/lib/miga/common.rb CHANGED
@@ -53,11 +53,11 @@ class MiGA::MiGA
53
53
  # Reports the advance of a task at +step+ (String), the +n+ out of +total+.
54
54
  # The advance is reported in powers of 1,024 if +bin+ is true, or powers of
55
55
  # 1,000 otherwise.
56
- # The report goes to $stderr iff --verborse
56
+ # The report goes to $stderr iff --verbose
57
57
  def advance(step, n = 0, total = nil, bin = true)
58
58
  # Initialize advance timing
59
59
  @_advance_time ||= { last: nil, n: 0, avg: nil }
60
- if n <= 1 || @_advance_time[:n] > n
60
+ if @_advance_time[:n] > n
61
61
  @_advance_time[:last] = nil
62
62
  @_advance_time[:n] = 0
63
63
  @_advance_time[:avg] = nil
@@ -65,16 +65,17 @@ class MiGA::MiGA
65
65
 
66
66
  # Estimate timing
67
67
  adv_n = n - @_advance_time[:n]
68
- unless total.nil? || @_advance_time[:last].nil? || adv_n <= 0
69
- if adv_n.to_f/n > 0.001
70
- this_time = (Time.now - @_advance_time[:last]).to_f
71
- this_avg = this_time / adv_n
72
- @_advance_time[:avg] ||= this_avg
73
- @_advance_time[:avg] = 0.9 * @_advance_time[:avg] + 0.1 * this_avg
74
- end
68
+ if total.nil? || @_advance_time[:last].nil? || adv_n.negative?
69
+ @_advance_time[:last] = Time.now
70
+ @_advance_time[:n] = n
71
+ elsif adv_n > 0.001 * total
72
+ this_time = (Time.now - @_advance_time[:last]).to_f
73
+ this_avg = this_time / adv_n
74
+ @_advance_time[:avg] ||= this_avg
75
+ @_advance_time[:avg] = 0.9 * @_advance_time[:avg] + 0.1 * this_avg
76
+ @_advance_time[:last] = Time.now
77
+ @_advance_time[:n] = n
75
78
  end
76
- @_advance_time[:last] = Time.now
77
- @_advance_time[:n] = n
78
79
 
79
80
  # Report
80
81
  adv =
@@ -281,7 +281,8 @@ module MiGA::Dataset::Result
281
281
  collection: '.ess',
282
282
  report: '.ess/log',
283
283
  alignments: '.ess/proteins.aln',
284
- fastaai_index: '.faix.db.gz'
284
+ fastaai_index: '.faix.db.gz',
285
+ fastaai_index_2: '.faix'
285
286
  )
286
287
  end
287
288
 
data/lib/miga/version.rb CHANGED
@@ -12,15 +12,15 @@ module MiGA
12
12
  # - String indicating release status:
13
13
  # - rc* release candidate, not released as gem
14
14
  # - [0-9]+ stable release, released as gem
15
- VERSION = [1.0, 5, 5].freeze
15
+ VERSION = [1.1, 0, 0].freeze
16
16
 
17
17
  ##
18
18
  # Nickname for the current major.minor version.
19
19
  VERSION_NAME = 'prima'
20
20
 
21
21
  ##
22
- # Date of the current gem release.
23
- VERSION_DATE = Date.new(2021, 9, 14)
22
+ # Date of the current gem release.
23
+ VERSION_DATE = Date.new(2021, 10, 28)
24
24
 
25
25
  ##
26
26
  # References of MiGA
@@ -36,19 +36,15 @@ HMM.essential.rb \
36
36
  > "${DATASET}.ess/log"
37
37
 
38
38
  # Index for FastAAI
39
- NOMULTI=$(miga list_datasets -P "$PROJECT" -D "$DATASET" --no-multi \
39
+ NOMULTI=$(miga ls -P "$PROJECT" -D "$DATASET" --no-multi \
40
40
  | wc -l | awk '{print $1}')
41
41
  if [[ "$NOMULTI" -eq "1" ]] ; then
42
- if [[ "$FAA" == *.gz ]] ; then
43
- gzip -cd "$FAA" > "${DATASET}.faix"
44
- else
45
- cp "$FAA" "${DATASET}.faix"
46
- fi
47
- FastAAI --qp "${DATASET}.faix" --output "${DATASET}.faix" \
48
- --ext ".faix" --index --input-paths --all-vs-all --threads "$CORES"
49
- rm "${DATASET}.faix"
50
- rm "${DATASET}.faix.hmm"
51
- rm "${DATASET}.faix.hmm.filt"
42
+ echo "$FAA" > "$DATASET"
43
+ FastAAI build_db --protein_file "$DATASET" \
44
+ -o "${DATASET}.faix.d" --threads "$CORES"
45
+ rm "$DATASET"
46
+ mv "${DATASET}.faix.d/database/FastAAI_database.sqlite.db" "${DATASET}.faix"
47
+ rm -r "${DATASET}.faix.d"
52
48
  fi
53
49
 
54
50
  # Reduce files