miga-base 1.0.5.5 → 1.1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/miga/cli/action/doctor.rb +12 -4
- data/lib/miga/cli/action/env.rb +1 -1
- data/lib/miga/cli/action/init.rb +1 -1
- data/lib/miga/cli/action/ncbi_get/downloads.rb +230 -0
- data/lib/miga/cli/action/ncbi_get.rb +9 -217
- data/lib/miga/cli/action/wf.rb +7 -3
- data/lib/miga/common.rb +12 -11
- data/lib/miga/dataset/result.rb +2 -1
- data/lib/miga/version.rb +3 -3
- data/scripts/essential_genes.bash +7 -11
- data/test/common_test.rb +7 -7
- data/utils/FastAAI/FastAAI +3630 -0
- data/utils/FastAAI/{FastAAI → FastAAI-legacy}/FastAAI +1 -1
- data/utils/FastAAI/{kAAI_v1.0_virus.py → FastAAI-legacy/kAAI_v1.0_virus.py} +0 -0
- data/utils/distance/commands.rb +24 -13
- metadata +6 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4076b3b3a4a4143ac9100ce4d58fada7615f68ad3e6174445510655f62904867
|
4
|
+
data.tar.gz: '0975a5feb4c9eb71a474be87dd14b58297ef1aa7bd8612c20f1ce65febbdf980'
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ebcb7fe28d415ca9709433975585518eb1ecd8e8270c584b6579da222e4d3733cc20d810787c3f764f6a6136e1a6f09b7cb6b1c00114c3ea9c0885370654f3a7
|
7
|
+
data.tar.gz: '082bd856ed21487e5de709e2067f1d3453f824e0ece7a77716c6fbe70d88a16c4d295196d5c6133e5667142e25f55f7e48e4a785afd160a14e8195a9b7efa6c2'
|
@@ -46,7 +46,7 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
|
|
46
46
|
dist: ['distances', 'Check distance summary tables'],
|
47
47
|
files: ['files', 'Check for outdated files'],
|
48
48
|
cds: ['cds', 'Check for gzipped genes and proteins'],
|
49
|
-
ess: ['essential-genes', 'Check for
|
49
|
+
ess: ['essential-genes', 'Check for outdated essential genes'],
|
50
50
|
mts: ['mytaxa-scan', 'Check for unarchived MyTaxa scan'],
|
51
51
|
start: ['start', 'Check for lingering .start files'],
|
52
52
|
tax: ['taxonomy', 'Check for taxonomy consistency (not yet implemented)']
|
@@ -252,16 +252,16 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
|
|
252
252
|
##
|
253
253
|
# Perform essential-genes operation with MiGA::Cli +cli+
|
254
254
|
def check_ess(cli)
|
255
|
-
cli.say 'Looking for
|
255
|
+
cli.say 'Looking for outdated essential genes'
|
256
256
|
cli.load_project.each_dataset do |d|
|
257
257
|
res = d.result(:essential_genes)
|
258
258
|
next if res.nil?
|
259
259
|
|
260
260
|
dir = res.file_path(:collection)
|
261
|
-
if dir.nil?
|
261
|
+
if dir.nil? || outdated_fastaai_ess(res)
|
262
262
|
cli.say " > Removing #{d.name}:essential_genes"
|
263
263
|
res.remove!
|
264
|
-
|
264
|
+
d.result(:stats)&.remove!
|
265
265
|
next
|
266
266
|
end
|
267
267
|
next if Dir["#{dir}/*.faa"].empty?
|
@@ -272,6 +272,14 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
|
|
272
272
|
end
|
273
273
|
end
|
274
274
|
|
275
|
+
##
|
276
|
+
# Check if the essential genes result +res+ has an outdated FastAAI index
|
277
|
+
def outdated_fastaai_ess(res)
|
278
|
+
idx1 = res.file_path(:fastaai_index)
|
279
|
+
idx2 = res.file_path(:fastaai_index_2)
|
280
|
+
idx2.nil? && !idx1.nil?
|
281
|
+
end
|
282
|
+
|
275
283
|
##
|
276
284
|
# Perform mytaxa-scan operation with MiGA::Cli +cli+
|
277
285
|
def check_mts(cli)
|
data/lib/miga/cli/action/env.rb
CHANGED
@@ -15,7 +15,7 @@ class MiGA::Cli::Action::Env < MiGA::Cli::Action
|
|
15
15
|
. "$MIGA_HOME/.miga_rc"
|
16
16
|
# Ensure MiGA & submodules are first in PATH
|
17
17
|
export PATH="$MIGA/bin:$PATH"
|
18
|
-
for util in enveomics/Scripts FastAAI/FastAAI multitrim ; do
|
18
|
+
for util in enveomics/Scripts FastAAI/FastAAI FastAAI multitrim ; do
|
19
19
|
export PATH="$MIGA/utils/$util:$PATH"
|
20
20
|
done
|
21
21
|
BASH
|
data/lib/miga/cli/action/init.rb
CHANGED
@@ -0,0 +1,230 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'miga/remote_dataset'
|
4
|
+
require 'csv'
|
5
|
+
|
6
|
+
##
|
7
|
+
# Helper module including download functions for the ncbi_get action
|
8
|
+
module MiGA::Cli::Action::NcbiGet::Downloads
|
9
|
+
def cli_task_flags(opt)
|
10
|
+
cli.opt_flag(
|
11
|
+
opt, 'reference',
|
12
|
+
'Download all reference genomes (ignore any other status)'
|
13
|
+
)
|
14
|
+
cli.opt_flag(opt, 'complete', 'Download complete genomes')
|
15
|
+
cli.opt_flag(opt, 'chromosome', 'Download complete chromosomes')
|
16
|
+
cli.opt_flag(opt, 'scaffold', 'Download genomes in scaffolds')
|
17
|
+
cli.opt_flag(opt, 'contig', 'Download genomes in contigs')
|
18
|
+
opt.on(
|
19
|
+
'--all',
|
20
|
+
'Download all genomes (in any status)'
|
21
|
+
) do
|
22
|
+
cli[:complete] = true
|
23
|
+
cli[:chromosome] = true
|
24
|
+
cli[:scaffold] = true
|
25
|
+
cli[:contig] = true
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
def cli_name_modifiers(opt)
|
30
|
+
opt.on(
|
31
|
+
'--no-version-name',
|
32
|
+
'Do not add sequence version to the dataset name',
|
33
|
+
'Only affects --complete and --chromosome'
|
34
|
+
) { |v| cli[:add_version] = v }
|
35
|
+
cli.opt_flag(
|
36
|
+
opt, 'legacy-name',
|
37
|
+
'Use dataset names based on chromosome entries instead of assembly',
|
38
|
+
:legacy_name
|
39
|
+
)
|
40
|
+
end
|
41
|
+
|
42
|
+
def cli_filters(opt)
|
43
|
+
opt.on(
|
44
|
+
'--blacklist PATH',
|
45
|
+
'A file with dataset names to blacklist'
|
46
|
+
) { |v| cli[:blacklist] = v }
|
47
|
+
cli.opt_flag(opt, 'dry', 'Do not download or save the datasets')
|
48
|
+
opt.on(
|
49
|
+
'--ignore-until STRING',
|
50
|
+
'Ignores all datasets until a name is found (useful for large reruns)'
|
51
|
+
) { |v| cli[:ignore_until] = v }
|
52
|
+
cli.opt_flag(
|
53
|
+
opt, 'get-metadata',
|
54
|
+
'Only download and update metadata for existing datasets', :get_md
|
55
|
+
)
|
56
|
+
end
|
57
|
+
|
58
|
+
def cli_save_actions(opt)
|
59
|
+
cli.opt_flag(
|
60
|
+
opt, 'only-metadata',
|
61
|
+
'Create datasets without input data but retrieve all metadata',
|
62
|
+
:only_md
|
63
|
+
)
|
64
|
+
opt.on(
|
65
|
+
'--save-every INT', Integer,
|
66
|
+
'Save project every this many downloaded datasets',
|
67
|
+
'If zero, it saves the project only once upon completion',
|
68
|
+
"By default: #{cli[:save_every]}"
|
69
|
+
) { |v| cli[:save_every] = v }
|
70
|
+
opt.on(
|
71
|
+
'-q', '--query',
|
72
|
+
'Register the datasets as queries, not reference datasets'
|
73
|
+
) { |v| cli[:query] = v }
|
74
|
+
opt.on(
|
75
|
+
'-u', '--unlink',
|
76
|
+
'Unlink all datasets in the project missing from the download list'
|
77
|
+
) { |v| cli[:unlink] = v }
|
78
|
+
opt.on(
|
79
|
+
'-R', '--remote-list PATH',
|
80
|
+
'Path to an output file with the list of all datasets listed remotely'
|
81
|
+
) { |v| cli[:remote_list] = v }
|
82
|
+
end
|
83
|
+
|
84
|
+
def sanitize_cli
|
85
|
+
cli.ensure_par(taxon: '-T')
|
86
|
+
tasks = %w[reference complete chromosome scaffold contig]
|
87
|
+
unless tasks.any? { |i| cli[i.to_sym] }
|
88
|
+
raise 'No action requested: pick at least one type of genome'
|
89
|
+
end
|
90
|
+
|
91
|
+
cli[:save_every] = 1 if cli[:dry]
|
92
|
+
end
|
93
|
+
|
94
|
+
def remote_list
|
95
|
+
cli.say 'Downloading genome list'
|
96
|
+
ds = {}
|
97
|
+
url = remote_list_url
|
98
|
+
doc = MiGA::RemoteDataset.download_url(url)
|
99
|
+
CSV.parse(doc, headers: true).each do |r|
|
100
|
+
asm = r['assembly']
|
101
|
+
next if asm.nil? || asm.empty? || asm == '-'
|
102
|
+
next unless r['ftp_path_genbank']
|
103
|
+
|
104
|
+
rep = remote_row_replicons(r)
|
105
|
+
n = remote_row_name(r, rep, asm)
|
106
|
+
|
107
|
+
# Register for download
|
108
|
+
fna_url = '%s/%s_genomic.fna.gz' %
|
109
|
+
[r['ftp_path_genbank'], File.basename(r['ftp_path_genbank'])]
|
110
|
+
ds[n] = {
|
111
|
+
ids: [fna_url], db: :assembly_gz, universe: :web,
|
112
|
+
md: {
|
113
|
+
type: :genome, ncbi_asm: asm, strain: r['strain']
|
114
|
+
}
|
115
|
+
}
|
116
|
+
ds[n][:md][:ncbi_nuccore] = rep.join(',') unless rep.nil?
|
117
|
+
unless r['release_date'].nil?
|
118
|
+
ds[n][:md][:release_date] = Time.parse(r['release_date']).to_s
|
119
|
+
end
|
120
|
+
end
|
121
|
+
ds
|
122
|
+
end
|
123
|
+
|
124
|
+
def remote_row_replicons(r)
|
125
|
+
return if r['replicons'].nil?
|
126
|
+
|
127
|
+
r['replicons']
|
128
|
+
.split('; ')
|
129
|
+
.map { |i| i.gsub(/.*:/, '') }
|
130
|
+
.map { |i| i.gsub(%r{/.*}, '') }
|
131
|
+
end
|
132
|
+
|
133
|
+
def remote_row_name(r, rep, asm)
|
134
|
+
return r['#organism'].miga_name if cli[:legacy_name] && cli[:reference]
|
135
|
+
|
136
|
+
if cli[:legacy_name] && ['Complete', ' Chromosome'].include?(r['level'])
|
137
|
+
acc = rep.nil? ? '' : rep.first
|
138
|
+
else
|
139
|
+
acc = asm
|
140
|
+
end
|
141
|
+
acc.gsub!(/\.\d+\Z/, '') unless cli[:add_version]
|
142
|
+
"#{r['#organism']}_#{acc}".miga_name
|
143
|
+
end
|
144
|
+
|
145
|
+
def remote_list_url
|
146
|
+
url_base = 'https://www.ncbi.nlm.nih.gov/genomes/solr2txt.cgi?'
|
147
|
+
url_param = {
|
148
|
+
q: '[display()].' \
|
149
|
+
'from(GenomeAssemblies).' \
|
150
|
+
'usingschema(/schema/GenomeAssemblies).' \
|
151
|
+
'matching(tab==["Prokaryotes"] and q=="' \
|
152
|
+
"#{cli[:taxon]&.tr('"', "'")}\"",
|
153
|
+
fields: 'organism|organism,assembly|assembly,replicons|replicons,' \
|
154
|
+
'level|level,ftp_path_genbank|ftp_path_genbank,' \
|
155
|
+
'release_date|release_date,strain|strain',
|
156
|
+
nolimit: 'on'
|
157
|
+
}
|
158
|
+
if cli[:reference]
|
159
|
+
url_param[:q] += ' and refseq_category==["representative"]'
|
160
|
+
else
|
161
|
+
status = {
|
162
|
+
complete: 'Complete',
|
163
|
+
chromosome: ' Chromosome', # <- The leading space is *VERY* important!
|
164
|
+
scaffold: 'Scaffold',
|
165
|
+
contig: 'Contig'
|
166
|
+
}.map { |k, v| '"' + v + '"' if cli[k] }.compact.join(',')
|
167
|
+
url_param[:q] += ' and level==[' + status + ']'
|
168
|
+
end
|
169
|
+
url_param[:q] += ')'
|
170
|
+
url_base + URI.encode_www_form(url_param)
|
171
|
+
end
|
172
|
+
|
173
|
+
def discard_blacklisted(ds)
|
174
|
+
unless cli[:blacklist].nil?
|
175
|
+
cli.say "Discarding datasets in #{cli[:blacklist]}"
|
176
|
+
File.readlines(cli[:blacklist])
|
177
|
+
.select { |i| i !~ /^#/ }
|
178
|
+
.map(&:chomp)
|
179
|
+
.each { |i| ds.delete i }
|
180
|
+
end
|
181
|
+
ds
|
182
|
+
end
|
183
|
+
|
184
|
+
def impose_limit(ds)
|
185
|
+
max = cli[:max_datasets].to_i
|
186
|
+
if !max.zero? && max < ds.size
|
187
|
+
cli.say "Subsampling list from #{ds.size} to #{max} datasets"
|
188
|
+
sample = ds.keys.sample(max)
|
189
|
+
ds.select! { |k, _| sample.include? k }
|
190
|
+
end
|
191
|
+
ds
|
192
|
+
end
|
193
|
+
|
194
|
+
def download_entries(ds, p)
|
195
|
+
cli.say "Downloading #{ds.size} " + (ds.size == 1 ? 'entry' : 'entries')
|
196
|
+
p.do_not_save = true if cli[:save_every] != 1
|
197
|
+
ignore = !cli[:ignore_until].nil?
|
198
|
+
downloaded = 0
|
199
|
+
d = []
|
200
|
+
ds.each do |name, body|
|
201
|
+
d << name
|
202
|
+
cli.puts name
|
203
|
+
ignore = false if ignore && name == cli[:ignore_until]
|
204
|
+
next if ignore || p.dataset(name).nil? == cli[:get_md]
|
205
|
+
|
206
|
+
downloaded += 1
|
207
|
+
unless cli[:dry]
|
208
|
+
save_entry(name, body, p)
|
209
|
+
p.save! if cli[:save_every] > 1 && (downloaded % cli[:save_every]).zero?
|
210
|
+
end
|
211
|
+
end
|
212
|
+
p.do_not_save = false
|
213
|
+
p.save! if cli[:save_every] != 1
|
214
|
+
[d, downloaded]
|
215
|
+
end
|
216
|
+
|
217
|
+
def save_entry(name, body, p)
|
218
|
+
cli.say ' Locating remote dataset'
|
219
|
+
body[:md][:metadata_only] = true if cli[:only_md]
|
220
|
+
rd = MiGA::RemoteDataset.new(body[:ids], body[:db], body[:universe])
|
221
|
+
if cli[:get_md]
|
222
|
+
cli.say ' Updating dataset'
|
223
|
+
rd.update_metadata(p.dataset(name), body[:md])
|
224
|
+
else
|
225
|
+
cli.say ' Creating dataset'
|
226
|
+
rd.save_to(p, name, !cli[:query], body[:md])
|
227
|
+
cli.add_metadata(p.add_dataset(name))
|
228
|
+
end
|
229
|
+
end
|
230
|
+
end
|
@@ -1,11 +1,11 @@
|
|
1
|
-
#
|
2
|
-
# @license Artistic-2.0
|
1
|
+
# frozen_string_literal: true
|
3
2
|
|
4
3
|
require 'miga/cli/action'
|
5
|
-
require 'miga/remote_dataset'
|
6
|
-
require 'csv'
|
7
4
|
|
8
5
|
class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action
|
6
|
+
require 'miga/cli/action/ncbi_get/downloads'
|
7
|
+
include MiGA::Cli::Action::NcbiGet::Downloads
|
8
|
+
|
9
9
|
def parse_cli
|
10
10
|
cli.defaults = {
|
11
11
|
query: false, unlink: false,
|
@@ -20,6 +20,10 @@ class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action
|
|
20
20
|
'-T', '--taxon STRING',
|
21
21
|
'(Mandatory) Taxon name (e.g., a species binomial)'
|
22
22
|
) { |v| cli[:taxon] = v }
|
23
|
+
opt.on(
|
24
|
+
'--max INT', Integer,
|
25
|
+
'Maximum number of datasets to download (by default: unlimited)'
|
26
|
+
) { |v| cli[:max_datasets] = v }
|
23
27
|
opt.on(
|
24
28
|
'-m', '--metadata STRING',
|
25
29
|
'Metadata as key-value pairs separated by = and delimited by comma',
|
@@ -41,6 +45,7 @@ class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action
|
|
41
45
|
p = cli.load_project
|
42
46
|
ds = remote_list
|
43
47
|
ds = discard_blacklisted(ds)
|
48
|
+
ds = impose_limit(ds)
|
44
49
|
d, downloaded = download_entries(ds, p)
|
45
50
|
|
46
51
|
# Finalize
|
@@ -59,217 +64,4 @@ class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action
|
|
59
64
|
cli.say "Datasets unlinked: #{unlink.size}"
|
60
65
|
end
|
61
66
|
|
62
|
-
private
|
63
|
-
|
64
|
-
def cli_task_flags(opt)
|
65
|
-
cli.opt_flag(
|
66
|
-
opt, 'reference',
|
67
|
-
'Download all reference genomes (ignore any other status)'
|
68
|
-
)
|
69
|
-
cli.opt_flag(opt, 'complete', 'Download complete genomes')
|
70
|
-
cli.opt_flag(opt, 'chromosome', 'Download complete chromosomes')
|
71
|
-
cli.opt_flag(opt, 'scaffold', 'Download genomes in scaffolds')
|
72
|
-
cli.opt_flag(opt, 'contig', 'Download genomes in contigs')
|
73
|
-
opt.on(
|
74
|
-
'--all',
|
75
|
-
'Download all genomes (in any status)'
|
76
|
-
) do
|
77
|
-
cli[:complete] = true
|
78
|
-
cli[:chromosome] = true
|
79
|
-
cli[:scaffold] = true
|
80
|
-
cli[:contig] = true
|
81
|
-
end
|
82
|
-
end
|
83
|
-
|
84
|
-
def cli_name_modifiers(opt)
|
85
|
-
opt.on(
|
86
|
-
'--no-version-name',
|
87
|
-
'Do not add sequence version to the dataset name',
|
88
|
-
'Only affects --complete and --chromosome'
|
89
|
-
) { |v| cli[:add_version] = v }
|
90
|
-
cli.opt_flag(
|
91
|
-
opt, 'legacy-name',
|
92
|
-
'Use dataset names based on chromosome entries instead of assembly',
|
93
|
-
:legacy_name
|
94
|
-
)
|
95
|
-
end
|
96
|
-
|
97
|
-
def cli_filters(opt)
|
98
|
-
opt.on(
|
99
|
-
'--blacklist PATH',
|
100
|
-
'A file with dataset names to blacklist'
|
101
|
-
) { |v| cli[:blacklist] = v }
|
102
|
-
cli.opt_flag(opt, 'dry', 'Do not download or save the datasets')
|
103
|
-
opt.on(
|
104
|
-
'--ignore-until STRING',
|
105
|
-
'Ignores all datasets until a name is found (useful for large reruns)'
|
106
|
-
) { |v| cli[:ignore_until] = v }
|
107
|
-
cli.opt_flag(
|
108
|
-
opt, 'get-metadata',
|
109
|
-
'Only download and update metadata for existing datasets', :get_md
|
110
|
-
)
|
111
|
-
end
|
112
|
-
|
113
|
-
def cli_save_actions(opt)
|
114
|
-
cli.opt_flag(
|
115
|
-
opt, 'only-metadata',
|
116
|
-
'Create datasets without input data but retrieve all metadata',
|
117
|
-
:only_md
|
118
|
-
)
|
119
|
-
opt.on(
|
120
|
-
'--save-every INT', Integer,
|
121
|
-
'Save project every this many downloaded datasets',
|
122
|
-
'If zero, it saves the project only once upon completion',
|
123
|
-
"By default: #{cli[:save_every]}"
|
124
|
-
) { |v| cli[:save_every] = v }
|
125
|
-
opt.on(
|
126
|
-
'-q', '--query',
|
127
|
-
'Register the datasets as queries, not reference datasets'
|
128
|
-
) { |v| cli[:query] = v }
|
129
|
-
opt.on(
|
130
|
-
'-u', '--unlink',
|
131
|
-
'Unlink all datasets in the project missing from the download list'
|
132
|
-
) { |v| cli[:unlink] = v }
|
133
|
-
opt.on(
|
134
|
-
'-R', '--remote-list PATH',
|
135
|
-
'Path to an output file with the list of all datasets listed remotely'
|
136
|
-
) { |v| cli[:remote_list] = v }
|
137
|
-
end
|
138
|
-
|
139
|
-
def sanitize_cli
|
140
|
-
cli.ensure_par(taxon: '-T')
|
141
|
-
tasks = %w[reference complete chromosome scaffold contig]
|
142
|
-
unless tasks.any? { |i| cli[i.to_sym] }
|
143
|
-
raise 'No action requested: pick at least one type of genome'
|
144
|
-
end
|
145
|
-
|
146
|
-
cli[:save_every] = 1 if cli[:dry]
|
147
|
-
end
|
148
|
-
|
149
|
-
def remote_list
|
150
|
-
cli.say 'Downloading genome list'
|
151
|
-
ds = {}
|
152
|
-
url = remote_list_url
|
153
|
-
doc = RemoteDataset.download_url(url)
|
154
|
-
CSV.parse(doc, headers: true).each do |r|
|
155
|
-
asm = r['assembly']
|
156
|
-
next if asm.nil? || asm.empty? || asm == '-'
|
157
|
-
next unless r['ftp_path_genbank']
|
158
|
-
|
159
|
-
rep = remote_row_replicons(r)
|
160
|
-
n = remote_row_name(r, rep, asm)
|
161
|
-
|
162
|
-
# Register for download
|
163
|
-
fna_url = '%s/%s_genomic.fna.gz' %
|
164
|
-
[r['ftp_path_genbank'], File.basename(r['ftp_path_genbank'])]
|
165
|
-
ds[n] = {
|
166
|
-
ids: [fna_url], db: :assembly_gz, universe: :web,
|
167
|
-
md: {
|
168
|
-
type: :genome, ncbi_asm: asm, strain: r['strain']
|
169
|
-
}
|
170
|
-
}
|
171
|
-
ds[n][:md][:ncbi_nuccore] = rep.join(',') unless rep.nil?
|
172
|
-
unless r['release_date'].nil?
|
173
|
-
ds[n][:md][:release_date] = Time.parse(r['release_date']).to_s
|
174
|
-
end
|
175
|
-
end
|
176
|
-
ds
|
177
|
-
end
|
178
|
-
|
179
|
-
def remote_row_replicons(r)
|
180
|
-
return if r['replicons'].nil?
|
181
|
-
|
182
|
-
r['replicons']
|
183
|
-
.split('; ')
|
184
|
-
.map { |i| i.gsub(/.*:/, '') }
|
185
|
-
.map { |i| i.gsub(%r{/.*}, '') }
|
186
|
-
end
|
187
|
-
|
188
|
-
def remote_row_name(r, rep, asm)
|
189
|
-
return r['#organism'].miga_name if cli[:legacy_name] && cli[:reference]
|
190
|
-
|
191
|
-
if cli[:legacy_name] && ['Complete', ' Chromosome'].include?(r['level'])
|
192
|
-
acc = rep.nil? ? '' : rep.first
|
193
|
-
else
|
194
|
-
acc = asm
|
195
|
-
end
|
196
|
-
acc.gsub!(/\.\d+\Z/, '') unless cli[:add_version]
|
197
|
-
"#{r['#organism']}_#{acc}".miga_name
|
198
|
-
end
|
199
|
-
|
200
|
-
def remote_list_url
|
201
|
-
url_base = 'https://www.ncbi.nlm.nih.gov/genomes/solr2txt.cgi?'
|
202
|
-
url_param = {
|
203
|
-
q: '[display()].' \
|
204
|
-
'from(GenomeAssemblies).' \
|
205
|
-
'usingschema(/schema/GenomeAssemblies).' \
|
206
|
-
'matching(tab==["Prokaryotes"] and q=="' \
|
207
|
-
"#{cli[:taxon]&.tr('"', "'")}\"",
|
208
|
-
fields: 'organism|organism,assembly|assembly,replicons|replicons,' \
|
209
|
-
'level|level,ftp_path_genbank|ftp_path_genbank,' \
|
210
|
-
'release_date|release_date,strain|strain',
|
211
|
-
nolimit: 'on'
|
212
|
-
}
|
213
|
-
if cli[:reference]
|
214
|
-
url_param[:q] += ' and refseq_category==["representative"]'
|
215
|
-
else
|
216
|
-
status = {
|
217
|
-
complete: 'Complete',
|
218
|
-
chromosome: ' Chromosome', # <- The leading space is *VERY* important!
|
219
|
-
scaffold: 'Scaffold',
|
220
|
-
contig: 'Contig'
|
221
|
-
}.map { |k, v| '"' + v + '"' if cli[k] }.compact.join(',')
|
222
|
-
url_param[:q] += ' and level==[' + status + ']'
|
223
|
-
end
|
224
|
-
url_param[:q] += ')'
|
225
|
-
url_base + URI.encode_www_form(url_param)
|
226
|
-
end
|
227
|
-
|
228
|
-
def discard_blacklisted(ds)
|
229
|
-
unless cli[:blacklist].nil?
|
230
|
-
cli.say "Discarding datasets in #{cli[:blacklist]}"
|
231
|
-
File.readlines(cli[:blacklist])
|
232
|
-
.select { |i| i !~ /^#/ }
|
233
|
-
.map(&:chomp)
|
234
|
-
.each { |i| ds.delete i }
|
235
|
-
end
|
236
|
-
ds
|
237
|
-
end
|
238
|
-
|
239
|
-
def download_entries(ds, p)
|
240
|
-
cli.say "Downloading #{ds.size} " + (ds.size == 1 ? 'entry' : 'entries')
|
241
|
-
p.do_not_save = true if cli[:save_every] != 1
|
242
|
-
ignore = !cli[:ignore_until].nil?
|
243
|
-
downloaded = 0
|
244
|
-
d = []
|
245
|
-
ds.each do |name, body|
|
246
|
-
d << name
|
247
|
-
cli.puts name
|
248
|
-
ignore = false if ignore && name == cli[:ignore_until]
|
249
|
-
next if ignore || p.dataset(name).nil? == cli[:get_md]
|
250
|
-
|
251
|
-
downloaded += 1
|
252
|
-
unless cli[:dry]
|
253
|
-
save_entry(name, body, p)
|
254
|
-
p.save! if cli[:save_every] > 1 && (downloaded % cli[:save_every]).zero?
|
255
|
-
end
|
256
|
-
end
|
257
|
-
p.do_not_save = false
|
258
|
-
p.save! if cli[:save_every] != 1
|
259
|
-
[d, downloaded]
|
260
|
-
end
|
261
|
-
|
262
|
-
def save_entry(name, body, p)
|
263
|
-
cli.say ' Locating remote dataset'
|
264
|
-
body[:md][:metadata_only] = true if cli[:only_md]
|
265
|
-
rd = RemoteDataset.new(body[:ids], body[:db], body[:universe])
|
266
|
-
if cli[:get_md]
|
267
|
-
cli.say ' Updating dataset'
|
268
|
-
rd.update_metadata(p.dataset(name), body[:md])
|
269
|
-
else
|
270
|
-
cli.say ' Creating dataset'
|
271
|
-
rd.save_to(p, name, !cli[:query], body[:md])
|
272
|
-
cli.add_metadata(p.add_dataset(name))
|
273
|
-
end
|
274
|
-
end
|
275
67
|
end
|
data/lib/miga/cli/action/wf.rb
CHANGED
@@ -38,6 +38,10 @@ module MiGA::Cli::Action::Wf
|
|
38
38
|
'--no-draft',
|
39
39
|
'Only download complete genomes, not drafts'
|
40
40
|
) { |v| cli[:ncbi_draft] = v }
|
41
|
+
opt.on(
|
42
|
+
'--max-download INT', Integer,
|
43
|
+
'Maximum number of genomes to download (by default: unlimited)'
|
44
|
+
) { |v| cli[:ncbi_max] = v }
|
41
45
|
end
|
42
46
|
if params[:qual]
|
43
47
|
opt.on(
|
@@ -125,9 +129,9 @@ module MiGA::Cli::Action::Wf
|
|
125
129
|
# Download datasets
|
126
130
|
unless cli[:ncbi_taxon].nil?
|
127
131
|
what = cli[:ncbi_draft] ? '--all' : '--complete'
|
128
|
-
|
129
|
-
|
130
|
-
)
|
132
|
+
cmd = ['ncbi_get', '-P', cli[:outdir], '-T', cli[:ncbi_taxon], what]
|
133
|
+
cmd += ['--max', cli[:ncbi_max]] if cli[:ncbi_max]
|
134
|
+
call_cli(cmd)
|
131
135
|
end
|
132
136
|
|
133
137
|
# Add datasets
|
data/lib/miga/common.rb
CHANGED
@@ -53,11 +53,11 @@ class MiGA::MiGA
|
|
53
53
|
# Reports the advance of a task at +step+ (String), the +n+ out of +total+.
|
54
54
|
# The advance is reported in powers of 1,024 if +bin+ is true, or powers of
|
55
55
|
# 1,000 otherwise.
|
56
|
-
# The report goes to $stderr iff --
|
56
|
+
# The report goes to $stderr iff --verbose
|
57
57
|
def advance(step, n = 0, total = nil, bin = true)
|
58
58
|
# Initialize advance timing
|
59
59
|
@_advance_time ||= { last: nil, n: 0, avg: nil }
|
60
|
-
if
|
60
|
+
if @_advance_time[:n] > n
|
61
61
|
@_advance_time[:last] = nil
|
62
62
|
@_advance_time[:n] = 0
|
63
63
|
@_advance_time[:avg] = nil
|
@@ -65,16 +65,17 @@ class MiGA::MiGA
|
|
65
65
|
|
66
66
|
# Estimate timing
|
67
67
|
adv_n = n - @_advance_time[:n]
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
68
|
+
if total.nil? || @_advance_time[:last].nil? || adv_n.negative?
|
69
|
+
@_advance_time[:last] = Time.now
|
70
|
+
@_advance_time[:n] = n
|
71
|
+
elsif adv_n > 0.001 * total
|
72
|
+
this_time = (Time.now - @_advance_time[:last]).to_f
|
73
|
+
this_avg = this_time / adv_n
|
74
|
+
@_advance_time[:avg] ||= this_avg
|
75
|
+
@_advance_time[:avg] = 0.9 * @_advance_time[:avg] + 0.1 * this_avg
|
76
|
+
@_advance_time[:last] = Time.now
|
77
|
+
@_advance_time[:n] = n
|
75
78
|
end
|
76
|
-
@_advance_time[:last] = Time.now
|
77
|
-
@_advance_time[:n] = n
|
78
79
|
|
79
80
|
# Report
|
80
81
|
adv =
|
data/lib/miga/dataset/result.rb
CHANGED
data/lib/miga/version.rb
CHANGED
@@ -12,15 +12,15 @@ module MiGA
|
|
12
12
|
# - String indicating release status:
|
13
13
|
# - rc* release candidate, not released as gem
|
14
14
|
# - [0-9]+ stable release, released as gem
|
15
|
-
VERSION = [1.0, 5, 5].freeze
|
15
|
+
VERSION = [1.1, 0, 0].freeze
|
16
16
|
|
17
17
|
##
|
18
18
|
# Nickname for the current major.minor version.
|
19
19
|
VERSION_NAME = 'prima'
|
20
20
|
|
21
21
|
##
|
22
|
-
# Date of the current gem
|
23
|
-
VERSION_DATE = Date.new(2021,
|
22
|
+
# Date of the current gem release.
|
23
|
+
VERSION_DATE = Date.new(2021, 10, 28)
|
24
24
|
|
25
25
|
##
|
26
26
|
# References of MiGA
|
@@ -36,19 +36,15 @@ HMM.essential.rb \
|
|
36
36
|
> "${DATASET}.ess/log"
|
37
37
|
|
38
38
|
# Index for FastAAI
|
39
|
-
NOMULTI=$(miga
|
39
|
+
NOMULTI=$(miga ls -P "$PROJECT" -D "$DATASET" --no-multi \
|
40
40
|
| wc -l | awk '{print $1}')
|
41
41
|
if [[ "$NOMULTI" -eq "1" ]] ; then
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
--ext ".faix" --index --input-paths --all-vs-all --threads "$CORES"
|
49
|
-
rm "${DATASET}.faix"
|
50
|
-
rm "${DATASET}.faix.hmm"
|
51
|
-
rm "${DATASET}.faix.hmm.filt"
|
42
|
+
echo "$FAA" > "$DATASET"
|
43
|
+
FastAAI build_db --protein_file "$DATASET" \
|
44
|
+
-o "${DATASET}.faix.d" --threads "$CORES"
|
45
|
+
rm "$DATASET"
|
46
|
+
mv "${DATASET}.faix.d/database/FastAAI_database.sqlite.db" "${DATASET}.faix"
|
47
|
+
rm -r "${DATASET}.faix.d"
|
52
48
|
fi
|
53
49
|
|
54
50
|
# Reduce files
|