miga-base 1.0.5.2 → 1.1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/miga-env +6 -0
- data/lib/miga/cli/action/doctor.rb +12 -4
- data/lib/miga/cli/action/env.rb +1 -1
- data/lib/miga/cli/action/init.rb +1 -1
- data/lib/miga/cli/action/ncbi_get/downloads.rb +230 -0
- data/lib/miga/cli/action/ncbi_get.rb +9 -217
- data/lib/miga/cli/action/wf.rb +7 -3
- data/lib/miga/common.rb +12 -11
- data/lib/miga/dataset/result.rb +2 -1
- data/lib/miga/version.rb +3 -3
- data/scripts/essential_genes.bash +7 -11
- data/scripts/miga.bash +2 -2
- data/test/common_test.rb +7 -7
- data/utils/FastAAI/FastAAI +3630 -0
- data/utils/FastAAI/{FastAAI → FastAAI-legacy}/FastAAI +1 -1
- data/utils/FastAAI/{kAAI_v1.0_virus.py → FastAAI-legacy/kAAI_v1.0_virus.py} +0 -0
- data/utils/distance/commands.rb +24 -13
- data/utils/requirements.txt +7 -7
- metadata +7 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4076b3b3a4a4143ac9100ce4d58fada7615f68ad3e6174445510655f62904867
|
4
|
+
data.tar.gz: '0975a5feb4c9eb71a474be87dd14b58297ef1aa7bd8612c20f1ce65febbdf980'
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ebcb7fe28d415ca9709433975585518eb1ecd8e8270c584b6579da222e4d3733cc20d810787c3f764f6a6136e1a6f09b7cb6b1c00114c3ea9c0885370654f3a7
|
7
|
+
data.tar.gz: '082bd856ed21487e5de709e2067f1d3453f824e0ece7a77716c6fbe70d88a16c4d295196d5c6133e5667142e25f55f7e48e4a785afd160a14e8195a9b7efa6c2'
|
data/bin/miga-env
ADDED
@@ -46,7 +46,7 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
|
|
46
46
|
dist: ['distances', 'Check distance summary tables'],
|
47
47
|
files: ['files', 'Check for outdated files'],
|
48
48
|
cds: ['cds', 'Check for gzipped genes and proteins'],
|
49
|
-
ess: ['essential-genes', 'Check for
|
49
|
+
ess: ['essential-genes', 'Check for outdated essential genes'],
|
50
50
|
mts: ['mytaxa-scan', 'Check for unarchived MyTaxa scan'],
|
51
51
|
start: ['start', 'Check for lingering .start files'],
|
52
52
|
tax: ['taxonomy', 'Check for taxonomy consistency (not yet implemented)']
|
@@ -252,16 +252,16 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
|
|
252
252
|
##
|
253
253
|
# Perform essential-genes operation with MiGA::Cli +cli+
|
254
254
|
def check_ess(cli)
|
255
|
-
cli.say 'Looking for
|
255
|
+
cli.say 'Looking for outdated essential genes'
|
256
256
|
cli.load_project.each_dataset do |d|
|
257
257
|
res = d.result(:essential_genes)
|
258
258
|
next if res.nil?
|
259
259
|
|
260
260
|
dir = res.file_path(:collection)
|
261
|
-
if dir.nil?
|
261
|
+
if dir.nil? || outdated_fastaai_ess(res)
|
262
262
|
cli.say " > Removing #{d.name}:essential_genes"
|
263
263
|
res.remove!
|
264
|
-
|
264
|
+
d.result(:stats)&.remove!
|
265
265
|
next
|
266
266
|
end
|
267
267
|
next if Dir["#{dir}/*.faa"].empty?
|
@@ -272,6 +272,14 @@ class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
|
|
272
272
|
end
|
273
273
|
end
|
274
274
|
|
275
|
+
##
|
276
|
+
# Check if the essential genes result +res+ has an outdated FastAAI index
|
277
|
+
def outdated_fastaai_ess(res)
|
278
|
+
idx1 = res.file_path(:fastaai_index)
|
279
|
+
idx2 = res.file_path(:fastaai_index_2)
|
280
|
+
idx2.nil? && !idx1.nil?
|
281
|
+
end
|
282
|
+
|
275
283
|
##
|
276
284
|
# Perform mytaxa-scan operation with MiGA::Cli +cli+
|
277
285
|
def check_mts(cli)
|
data/lib/miga/cli/action/env.rb
CHANGED
@@ -15,7 +15,7 @@ class MiGA::Cli::Action::Env < MiGA::Cli::Action
|
|
15
15
|
. "$MIGA_HOME/.miga_rc"
|
16
16
|
# Ensure MiGA & submodules are first in PATH
|
17
17
|
export PATH="$MIGA/bin:$PATH"
|
18
|
-
for util in enveomics/Scripts FastAAI/FastAAI multitrim ; do
|
18
|
+
for util in enveomics/Scripts FastAAI/FastAAI FastAAI multitrim ; do
|
19
19
|
export PATH="$MIGA/utils/$util:$PATH"
|
20
20
|
done
|
21
21
|
BASH
|
data/lib/miga/cli/action/init.rb
CHANGED
@@ -0,0 +1,230 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'miga/remote_dataset'
|
4
|
+
require 'csv'
|
5
|
+
|
6
|
+
##
|
7
|
+
# Helper module including download functions for the ncbi_get action
|
8
|
+
module MiGA::Cli::Action::NcbiGet::Downloads
|
9
|
+
def cli_task_flags(opt)
|
10
|
+
cli.opt_flag(
|
11
|
+
opt, 'reference',
|
12
|
+
'Download all reference genomes (ignore any other status)'
|
13
|
+
)
|
14
|
+
cli.opt_flag(opt, 'complete', 'Download complete genomes')
|
15
|
+
cli.opt_flag(opt, 'chromosome', 'Download complete chromosomes')
|
16
|
+
cli.opt_flag(opt, 'scaffold', 'Download genomes in scaffolds')
|
17
|
+
cli.opt_flag(opt, 'contig', 'Download genomes in contigs')
|
18
|
+
opt.on(
|
19
|
+
'--all',
|
20
|
+
'Download all genomes (in any status)'
|
21
|
+
) do
|
22
|
+
cli[:complete] = true
|
23
|
+
cli[:chromosome] = true
|
24
|
+
cli[:scaffold] = true
|
25
|
+
cli[:contig] = true
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
def cli_name_modifiers(opt)
|
30
|
+
opt.on(
|
31
|
+
'--no-version-name',
|
32
|
+
'Do not add sequence version to the dataset name',
|
33
|
+
'Only affects --complete and --chromosome'
|
34
|
+
) { |v| cli[:add_version] = v }
|
35
|
+
cli.opt_flag(
|
36
|
+
opt, 'legacy-name',
|
37
|
+
'Use dataset names based on chromosome entries instead of assembly',
|
38
|
+
:legacy_name
|
39
|
+
)
|
40
|
+
end
|
41
|
+
|
42
|
+
def cli_filters(opt)
|
43
|
+
opt.on(
|
44
|
+
'--blacklist PATH',
|
45
|
+
'A file with dataset names to blacklist'
|
46
|
+
) { |v| cli[:blacklist] = v }
|
47
|
+
cli.opt_flag(opt, 'dry', 'Do not download or save the datasets')
|
48
|
+
opt.on(
|
49
|
+
'--ignore-until STRING',
|
50
|
+
'Ignores all datasets until a name is found (useful for large reruns)'
|
51
|
+
) { |v| cli[:ignore_until] = v }
|
52
|
+
cli.opt_flag(
|
53
|
+
opt, 'get-metadata',
|
54
|
+
'Only download and update metadata for existing datasets', :get_md
|
55
|
+
)
|
56
|
+
end
|
57
|
+
|
58
|
+
def cli_save_actions(opt)
|
59
|
+
cli.opt_flag(
|
60
|
+
opt, 'only-metadata',
|
61
|
+
'Create datasets without input data but retrieve all metadata',
|
62
|
+
:only_md
|
63
|
+
)
|
64
|
+
opt.on(
|
65
|
+
'--save-every INT', Integer,
|
66
|
+
'Save project every this many downloaded datasets',
|
67
|
+
'If zero, it saves the project only once upon completion',
|
68
|
+
"By default: #{cli[:save_every]}"
|
69
|
+
) { |v| cli[:save_every] = v }
|
70
|
+
opt.on(
|
71
|
+
'-q', '--query',
|
72
|
+
'Register the datasets as queries, not reference datasets'
|
73
|
+
) { |v| cli[:query] = v }
|
74
|
+
opt.on(
|
75
|
+
'-u', '--unlink',
|
76
|
+
'Unlink all datasets in the project missing from the download list'
|
77
|
+
) { |v| cli[:unlink] = v }
|
78
|
+
opt.on(
|
79
|
+
'-R', '--remote-list PATH',
|
80
|
+
'Path to an output file with the list of all datasets listed remotely'
|
81
|
+
) { |v| cli[:remote_list] = v }
|
82
|
+
end
|
83
|
+
|
84
|
+
def sanitize_cli
|
85
|
+
cli.ensure_par(taxon: '-T')
|
86
|
+
tasks = %w[reference complete chromosome scaffold contig]
|
87
|
+
unless tasks.any? { |i| cli[i.to_sym] }
|
88
|
+
raise 'No action requested: pick at least one type of genome'
|
89
|
+
end
|
90
|
+
|
91
|
+
cli[:save_every] = 1 if cli[:dry]
|
92
|
+
end
|
93
|
+
|
94
|
+
def remote_list
|
95
|
+
cli.say 'Downloading genome list'
|
96
|
+
ds = {}
|
97
|
+
url = remote_list_url
|
98
|
+
doc = MiGA::RemoteDataset.download_url(url)
|
99
|
+
CSV.parse(doc, headers: true).each do |r|
|
100
|
+
asm = r['assembly']
|
101
|
+
next if asm.nil? || asm.empty? || asm == '-'
|
102
|
+
next unless r['ftp_path_genbank']
|
103
|
+
|
104
|
+
rep = remote_row_replicons(r)
|
105
|
+
n = remote_row_name(r, rep, asm)
|
106
|
+
|
107
|
+
# Register for download
|
108
|
+
fna_url = '%s/%s_genomic.fna.gz' %
|
109
|
+
[r['ftp_path_genbank'], File.basename(r['ftp_path_genbank'])]
|
110
|
+
ds[n] = {
|
111
|
+
ids: [fna_url], db: :assembly_gz, universe: :web,
|
112
|
+
md: {
|
113
|
+
type: :genome, ncbi_asm: asm, strain: r['strain']
|
114
|
+
}
|
115
|
+
}
|
116
|
+
ds[n][:md][:ncbi_nuccore] = rep.join(',') unless rep.nil?
|
117
|
+
unless r['release_date'].nil?
|
118
|
+
ds[n][:md][:release_date] = Time.parse(r['release_date']).to_s
|
119
|
+
end
|
120
|
+
end
|
121
|
+
ds
|
122
|
+
end
|
123
|
+
|
124
|
+
def remote_row_replicons(r)
|
125
|
+
return if r['replicons'].nil?
|
126
|
+
|
127
|
+
r['replicons']
|
128
|
+
.split('; ')
|
129
|
+
.map { |i| i.gsub(/.*:/, '') }
|
130
|
+
.map { |i| i.gsub(%r{/.*}, '') }
|
131
|
+
end
|
132
|
+
|
133
|
+
def remote_row_name(r, rep, asm)
|
134
|
+
return r['#organism'].miga_name if cli[:legacy_name] && cli[:reference]
|
135
|
+
|
136
|
+
if cli[:legacy_name] && ['Complete', ' Chromosome'].include?(r['level'])
|
137
|
+
acc = rep.nil? ? '' : rep.first
|
138
|
+
else
|
139
|
+
acc = asm
|
140
|
+
end
|
141
|
+
acc.gsub!(/\.\d+\Z/, '') unless cli[:add_version]
|
142
|
+
"#{r['#organism']}_#{acc}".miga_name
|
143
|
+
end
|
144
|
+
|
145
|
+
def remote_list_url
|
146
|
+
url_base = 'https://www.ncbi.nlm.nih.gov/genomes/solr2txt.cgi?'
|
147
|
+
url_param = {
|
148
|
+
q: '[display()].' \
|
149
|
+
'from(GenomeAssemblies).' \
|
150
|
+
'usingschema(/schema/GenomeAssemblies).' \
|
151
|
+
'matching(tab==["Prokaryotes"] and q=="' \
|
152
|
+
"#{cli[:taxon]&.tr('"', "'")}\"",
|
153
|
+
fields: 'organism|organism,assembly|assembly,replicons|replicons,' \
|
154
|
+
'level|level,ftp_path_genbank|ftp_path_genbank,' \
|
155
|
+
'release_date|release_date,strain|strain',
|
156
|
+
nolimit: 'on'
|
157
|
+
}
|
158
|
+
if cli[:reference]
|
159
|
+
url_param[:q] += ' and refseq_category==["representative"]'
|
160
|
+
else
|
161
|
+
status = {
|
162
|
+
complete: 'Complete',
|
163
|
+
chromosome: ' Chromosome', # <- The leading space is *VERY* important!
|
164
|
+
scaffold: 'Scaffold',
|
165
|
+
contig: 'Contig'
|
166
|
+
}.map { |k, v| '"' + v + '"' if cli[k] }.compact.join(',')
|
167
|
+
url_param[:q] += ' and level==[' + status + ']'
|
168
|
+
end
|
169
|
+
url_param[:q] += ')'
|
170
|
+
url_base + URI.encode_www_form(url_param)
|
171
|
+
end
|
172
|
+
|
173
|
+
def discard_blacklisted(ds)
|
174
|
+
unless cli[:blacklist].nil?
|
175
|
+
cli.say "Discarding datasets in #{cli[:blacklist]}"
|
176
|
+
File.readlines(cli[:blacklist])
|
177
|
+
.select { |i| i !~ /^#/ }
|
178
|
+
.map(&:chomp)
|
179
|
+
.each { |i| ds.delete i }
|
180
|
+
end
|
181
|
+
ds
|
182
|
+
end
|
183
|
+
|
184
|
+
def impose_limit(ds)
|
185
|
+
max = cli[:max_datasets].to_i
|
186
|
+
if !max.zero? && max < ds.size
|
187
|
+
cli.say "Subsampling list from #{ds.size} to #{max} datasets"
|
188
|
+
sample = ds.keys.sample(max)
|
189
|
+
ds.select! { |k, _| sample.include? k }
|
190
|
+
end
|
191
|
+
ds
|
192
|
+
end
|
193
|
+
|
194
|
+
def download_entries(ds, p)
|
195
|
+
cli.say "Downloading #{ds.size} " + (ds.size == 1 ? 'entry' : 'entries')
|
196
|
+
p.do_not_save = true if cli[:save_every] != 1
|
197
|
+
ignore = !cli[:ignore_until].nil?
|
198
|
+
downloaded = 0
|
199
|
+
d = []
|
200
|
+
ds.each do |name, body|
|
201
|
+
d << name
|
202
|
+
cli.puts name
|
203
|
+
ignore = false if ignore && name == cli[:ignore_until]
|
204
|
+
next if ignore || p.dataset(name).nil? == cli[:get_md]
|
205
|
+
|
206
|
+
downloaded += 1
|
207
|
+
unless cli[:dry]
|
208
|
+
save_entry(name, body, p)
|
209
|
+
p.save! if cli[:save_every] > 1 && (downloaded % cli[:save_every]).zero?
|
210
|
+
end
|
211
|
+
end
|
212
|
+
p.do_not_save = false
|
213
|
+
p.save! if cli[:save_every] != 1
|
214
|
+
[d, downloaded]
|
215
|
+
end
|
216
|
+
|
217
|
+
def save_entry(name, body, p)
|
218
|
+
cli.say ' Locating remote dataset'
|
219
|
+
body[:md][:metadata_only] = true if cli[:only_md]
|
220
|
+
rd = MiGA::RemoteDataset.new(body[:ids], body[:db], body[:universe])
|
221
|
+
if cli[:get_md]
|
222
|
+
cli.say ' Updating dataset'
|
223
|
+
rd.update_metadata(p.dataset(name), body[:md])
|
224
|
+
else
|
225
|
+
cli.say ' Creating dataset'
|
226
|
+
rd.save_to(p, name, !cli[:query], body[:md])
|
227
|
+
cli.add_metadata(p.add_dataset(name))
|
228
|
+
end
|
229
|
+
end
|
230
|
+
end
|
@@ -1,11 +1,11 @@
|
|
1
|
-
#
|
2
|
-
# @license Artistic-2.0
|
1
|
+
# frozen_string_literal: true
|
3
2
|
|
4
3
|
require 'miga/cli/action'
|
5
|
-
require 'miga/remote_dataset'
|
6
|
-
require 'csv'
|
7
4
|
|
8
5
|
class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action
|
6
|
+
require 'miga/cli/action/ncbi_get/downloads'
|
7
|
+
include MiGA::Cli::Action::NcbiGet::Downloads
|
8
|
+
|
9
9
|
def parse_cli
|
10
10
|
cli.defaults = {
|
11
11
|
query: false, unlink: false,
|
@@ -20,6 +20,10 @@ class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action
|
|
20
20
|
'-T', '--taxon STRING',
|
21
21
|
'(Mandatory) Taxon name (e.g., a species binomial)'
|
22
22
|
) { |v| cli[:taxon] = v }
|
23
|
+
opt.on(
|
24
|
+
'--max INT', Integer,
|
25
|
+
'Maximum number of datasets to download (by default: unlimited)'
|
26
|
+
) { |v| cli[:max_datasets] = v }
|
23
27
|
opt.on(
|
24
28
|
'-m', '--metadata STRING',
|
25
29
|
'Metadata as key-value pairs separated by = and delimited by comma',
|
@@ -41,6 +45,7 @@ class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action
|
|
41
45
|
p = cli.load_project
|
42
46
|
ds = remote_list
|
43
47
|
ds = discard_blacklisted(ds)
|
48
|
+
ds = impose_limit(ds)
|
44
49
|
d, downloaded = download_entries(ds, p)
|
45
50
|
|
46
51
|
# Finalize
|
@@ -59,217 +64,4 @@ class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action
|
|
59
64
|
cli.say "Datasets unlinked: #{unlink.size}"
|
60
65
|
end
|
61
66
|
|
62
|
-
private
|
63
|
-
|
64
|
-
def cli_task_flags(opt)
|
65
|
-
cli.opt_flag(
|
66
|
-
opt, 'reference',
|
67
|
-
'Download all reference genomes (ignore any other status)'
|
68
|
-
)
|
69
|
-
cli.opt_flag(opt, 'complete', 'Download complete genomes')
|
70
|
-
cli.opt_flag(opt, 'chromosome', 'Download complete chromosomes')
|
71
|
-
cli.opt_flag(opt, 'scaffold', 'Download genomes in scaffolds')
|
72
|
-
cli.opt_flag(opt, 'contig', 'Download genomes in contigs')
|
73
|
-
opt.on(
|
74
|
-
'--all',
|
75
|
-
'Download all genomes (in any status)'
|
76
|
-
) do
|
77
|
-
cli[:complete] = true
|
78
|
-
cli[:chromosome] = true
|
79
|
-
cli[:scaffold] = true
|
80
|
-
cli[:contig] = true
|
81
|
-
end
|
82
|
-
end
|
83
|
-
|
84
|
-
def cli_name_modifiers(opt)
|
85
|
-
opt.on(
|
86
|
-
'--no-version-name',
|
87
|
-
'Do not add sequence version to the dataset name',
|
88
|
-
'Only affects --complete and --chromosome'
|
89
|
-
) { |v| cli[:add_version] = v }
|
90
|
-
cli.opt_flag(
|
91
|
-
opt, 'legacy-name',
|
92
|
-
'Use dataset names based on chromosome entries instead of assembly',
|
93
|
-
:legacy_name
|
94
|
-
)
|
95
|
-
end
|
96
|
-
|
97
|
-
def cli_filters(opt)
|
98
|
-
opt.on(
|
99
|
-
'--blacklist PATH',
|
100
|
-
'A file with dataset names to blacklist'
|
101
|
-
) { |v| cli[:blacklist] = v }
|
102
|
-
cli.opt_flag(opt, 'dry', 'Do not download or save the datasets')
|
103
|
-
opt.on(
|
104
|
-
'--ignore-until STRING',
|
105
|
-
'Ignores all datasets until a name is found (useful for large reruns)'
|
106
|
-
) { |v| cli[:ignore_until] = v }
|
107
|
-
cli.opt_flag(
|
108
|
-
opt, 'get-metadata',
|
109
|
-
'Only download and update metadata for existing datasets', :get_md
|
110
|
-
)
|
111
|
-
end
|
112
|
-
|
113
|
-
def cli_save_actions(opt)
|
114
|
-
cli.opt_flag(
|
115
|
-
opt, 'only-metadata',
|
116
|
-
'Create datasets without input data but retrieve all metadata',
|
117
|
-
:only_md
|
118
|
-
)
|
119
|
-
opt.on(
|
120
|
-
'--save-every INT', Integer,
|
121
|
-
'Save project every this many downloaded datasets',
|
122
|
-
'If zero, it saves the project only once upon completion',
|
123
|
-
"By default: #{cli[:save_every]}"
|
124
|
-
) { |v| cli[:save_every] = v }
|
125
|
-
opt.on(
|
126
|
-
'-q', '--query',
|
127
|
-
'Register the datasets as queries, not reference datasets'
|
128
|
-
) { |v| cli[:query] = v }
|
129
|
-
opt.on(
|
130
|
-
'-u', '--unlink',
|
131
|
-
'Unlink all datasets in the project missing from the download list'
|
132
|
-
) { |v| cli[:unlink] = v }
|
133
|
-
opt.on(
|
134
|
-
'-R', '--remote-list PATH',
|
135
|
-
'Path to an output file with the list of all datasets listed remotely'
|
136
|
-
) { |v| cli[:remote_list] = v }
|
137
|
-
end
|
138
|
-
|
139
|
-
def sanitize_cli
|
140
|
-
cli.ensure_par(taxon: '-T')
|
141
|
-
tasks = %w[reference complete chromosome scaffold contig]
|
142
|
-
unless tasks.any? { |i| cli[i.to_sym] }
|
143
|
-
raise 'No action requested: pick at least one type of genome'
|
144
|
-
end
|
145
|
-
|
146
|
-
cli[:save_every] = 1 if cli[:dry]
|
147
|
-
end
|
148
|
-
|
149
|
-
def remote_list
|
150
|
-
cli.say 'Downloading genome list'
|
151
|
-
ds = {}
|
152
|
-
url = remote_list_url
|
153
|
-
doc = RemoteDataset.download_url(url)
|
154
|
-
CSV.parse(doc, headers: true).each do |r|
|
155
|
-
asm = r['assembly']
|
156
|
-
next if asm.nil? || asm.empty? || asm == '-'
|
157
|
-
next unless r['ftp_path_genbank']
|
158
|
-
|
159
|
-
rep = remote_row_replicons(r)
|
160
|
-
n = remote_row_name(r, rep, asm)
|
161
|
-
|
162
|
-
# Register for download
|
163
|
-
fna_url = '%s/%s_genomic.fna.gz' %
|
164
|
-
[r['ftp_path_genbank'], File.basename(r['ftp_path_genbank'])]
|
165
|
-
ds[n] = {
|
166
|
-
ids: [fna_url], db: :assembly_gz, universe: :web,
|
167
|
-
md: {
|
168
|
-
type: :genome, ncbi_asm: asm, strain: r['strain']
|
169
|
-
}
|
170
|
-
}
|
171
|
-
ds[n][:md][:ncbi_nuccore] = rep.join(',') unless rep.nil?
|
172
|
-
unless r['release_date'].nil?
|
173
|
-
ds[n][:md][:release_date] = Time.parse(r['release_date']).to_s
|
174
|
-
end
|
175
|
-
end
|
176
|
-
ds
|
177
|
-
end
|
178
|
-
|
179
|
-
def remote_row_replicons(r)
|
180
|
-
return if r['replicons'].nil?
|
181
|
-
|
182
|
-
r['replicons']
|
183
|
-
.split('; ')
|
184
|
-
.map { |i| i.gsub(/.*:/, '') }
|
185
|
-
.map { |i| i.gsub(%r{/.*}, '') }
|
186
|
-
end
|
187
|
-
|
188
|
-
def remote_row_name(r, rep, asm)
|
189
|
-
return r['#organism'].miga_name if cli[:legacy_name] && cli[:reference]
|
190
|
-
|
191
|
-
if cli[:legacy_name] && ['Complete', ' Chromosome'].include?(r['level'])
|
192
|
-
acc = rep.nil? ? '' : rep.first
|
193
|
-
else
|
194
|
-
acc = asm
|
195
|
-
end
|
196
|
-
acc.gsub!(/\.\d+\Z/, '') unless cli[:add_version]
|
197
|
-
"#{r['#organism']}_#{acc}".miga_name
|
198
|
-
end
|
199
|
-
|
200
|
-
def remote_list_url
|
201
|
-
url_base = 'https://www.ncbi.nlm.nih.gov/genomes/solr2txt.cgi?'
|
202
|
-
url_param = {
|
203
|
-
q: '[display()].' \
|
204
|
-
'from(GenomeAssemblies).' \
|
205
|
-
'usingschema(/schema/GenomeAssemblies).' \
|
206
|
-
'matching(tab==["Prokaryotes"] and q=="' \
|
207
|
-
"#{cli[:taxon]&.tr('"', "'")}\"",
|
208
|
-
fields: 'organism|organism,assembly|assembly,replicons|replicons,' \
|
209
|
-
'level|level,ftp_path_genbank|ftp_path_genbank,' \
|
210
|
-
'release_date|release_date,strain|strain',
|
211
|
-
nolimit: 'on'
|
212
|
-
}
|
213
|
-
if cli[:reference]
|
214
|
-
url_param[:q] += ' and refseq_category==["representative"]'
|
215
|
-
else
|
216
|
-
status = {
|
217
|
-
complete: 'Complete',
|
218
|
-
chromosome: ' Chromosome', # <- The leading space is *VERY* important!
|
219
|
-
scaffold: 'Scaffold',
|
220
|
-
contig: 'Contig'
|
221
|
-
}.map { |k, v| '"' + v + '"' if cli[k] }.compact.join(',')
|
222
|
-
url_param[:q] += ' and level==[' + status + ']'
|
223
|
-
end
|
224
|
-
url_param[:q] += ')'
|
225
|
-
url_base + URI.encode_www_form(url_param)
|
226
|
-
end
|
227
|
-
|
228
|
-
def discard_blacklisted(ds)
|
229
|
-
unless cli[:blacklist].nil?
|
230
|
-
cli.say "Discarding datasets in #{cli[:blacklist]}"
|
231
|
-
File.readlines(cli[:blacklist])
|
232
|
-
.select { |i| i !~ /^#/ }
|
233
|
-
.map(&:chomp)
|
234
|
-
.each { |i| ds.delete i }
|
235
|
-
end
|
236
|
-
ds
|
237
|
-
end
|
238
|
-
|
239
|
-
def download_entries(ds, p)
|
240
|
-
cli.say "Downloading #{ds.size} " + (ds.size == 1 ? 'entry' : 'entries')
|
241
|
-
p.do_not_save = true if cli[:save_every] != 1
|
242
|
-
ignore = !cli[:ignore_until].nil?
|
243
|
-
downloaded = 0
|
244
|
-
d = []
|
245
|
-
ds.each do |name, body|
|
246
|
-
d << name
|
247
|
-
cli.puts name
|
248
|
-
ignore = false if ignore && name == cli[:ignore_until]
|
249
|
-
next if ignore || p.dataset(name).nil? == cli[:get_md]
|
250
|
-
|
251
|
-
downloaded += 1
|
252
|
-
unless cli[:dry]
|
253
|
-
save_entry(name, body, p)
|
254
|
-
p.save! if cli[:save_every] > 1 && (downloaded % cli[:save_every]).zero?
|
255
|
-
end
|
256
|
-
end
|
257
|
-
p.do_not_save = false
|
258
|
-
p.save! if cli[:save_every] != 1
|
259
|
-
[d, downloaded]
|
260
|
-
end
|
261
|
-
|
262
|
-
def save_entry(name, body, p)
|
263
|
-
cli.say ' Locating remote dataset'
|
264
|
-
body[:md][:metadata_only] = true if cli[:only_md]
|
265
|
-
rd = RemoteDataset.new(body[:ids], body[:db], body[:universe])
|
266
|
-
if cli[:get_md]
|
267
|
-
cli.say ' Updating dataset'
|
268
|
-
rd.update_metadata(p.dataset(name), body[:md])
|
269
|
-
else
|
270
|
-
cli.say ' Creating dataset'
|
271
|
-
rd.save_to(p, name, !cli[:query], body[:md])
|
272
|
-
cli.add_metadata(p.add_dataset(name))
|
273
|
-
end
|
274
|
-
end
|
275
67
|
end
|
data/lib/miga/cli/action/wf.rb
CHANGED
@@ -38,6 +38,10 @@ module MiGA::Cli::Action::Wf
|
|
38
38
|
'--no-draft',
|
39
39
|
'Only download complete genomes, not drafts'
|
40
40
|
) { |v| cli[:ncbi_draft] = v }
|
41
|
+
opt.on(
|
42
|
+
'--max-download INT', Integer,
|
43
|
+
'Maximum number of genomes to download (by default: unlimited)'
|
44
|
+
) { |v| cli[:ncbi_max] = v }
|
41
45
|
end
|
42
46
|
if params[:qual]
|
43
47
|
opt.on(
|
@@ -125,9 +129,9 @@ module MiGA::Cli::Action::Wf
|
|
125
129
|
# Download datasets
|
126
130
|
unless cli[:ncbi_taxon].nil?
|
127
131
|
what = cli[:ncbi_draft] ? '--all' : '--complete'
|
128
|
-
|
129
|
-
|
130
|
-
)
|
132
|
+
cmd = ['ncbi_get', '-P', cli[:outdir], '-T', cli[:ncbi_taxon], what]
|
133
|
+
cmd += ['--max', cli[:ncbi_max]] if cli[:ncbi_max]
|
134
|
+
call_cli(cmd)
|
131
135
|
end
|
132
136
|
|
133
137
|
# Add datasets
|
data/lib/miga/common.rb
CHANGED
@@ -53,11 +53,11 @@ class MiGA::MiGA
|
|
53
53
|
# Reports the advance of a task at +step+ (String), the +n+ out of +total+.
|
54
54
|
# The advance is reported in powers of 1,024 if +bin+ is true, or powers of
|
55
55
|
# 1,000 otherwise.
|
56
|
-
# The report goes to $stderr iff --
|
56
|
+
# The report goes to $stderr iff --verbose
|
57
57
|
def advance(step, n = 0, total = nil, bin = true)
|
58
58
|
# Initialize advance timing
|
59
59
|
@_advance_time ||= { last: nil, n: 0, avg: nil }
|
60
|
-
if
|
60
|
+
if @_advance_time[:n] > n
|
61
61
|
@_advance_time[:last] = nil
|
62
62
|
@_advance_time[:n] = 0
|
63
63
|
@_advance_time[:avg] = nil
|
@@ -65,16 +65,17 @@ class MiGA::MiGA
|
|
65
65
|
|
66
66
|
# Estimate timing
|
67
67
|
adv_n = n - @_advance_time[:n]
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
68
|
+
if total.nil? || @_advance_time[:last].nil? || adv_n.negative?
|
69
|
+
@_advance_time[:last] = Time.now
|
70
|
+
@_advance_time[:n] = n
|
71
|
+
elsif adv_n > 0.001 * total
|
72
|
+
this_time = (Time.now - @_advance_time[:last]).to_f
|
73
|
+
this_avg = this_time / adv_n
|
74
|
+
@_advance_time[:avg] ||= this_avg
|
75
|
+
@_advance_time[:avg] = 0.9 * @_advance_time[:avg] + 0.1 * this_avg
|
76
|
+
@_advance_time[:last] = Time.now
|
77
|
+
@_advance_time[:n] = n
|
75
78
|
end
|
76
|
-
@_advance_time[:last] = Time.now
|
77
|
-
@_advance_time[:n] = n
|
78
79
|
|
79
80
|
# Report
|
80
81
|
adv =
|
data/lib/miga/dataset/result.rb
CHANGED
data/lib/miga/version.rb
CHANGED
@@ -12,15 +12,15 @@ module MiGA
|
|
12
12
|
# - String indicating release status:
|
13
13
|
# - rc* release candidate, not released as gem
|
14
14
|
# - [0-9]+ stable release, released as gem
|
15
|
-
VERSION = [1.
|
15
|
+
VERSION = [1.1, 0, 0].freeze
|
16
16
|
|
17
17
|
##
|
18
18
|
# Nickname for the current major.minor version.
|
19
19
|
VERSION_NAME = 'prima'
|
20
20
|
|
21
21
|
##
|
22
|
-
# Date of the current gem
|
23
|
-
VERSION_DATE = Date.new(2021,
|
22
|
+
# Date of the current gem relese.
|
23
|
+
VERSION_DATE = Date.new(2021, 10, 28)
|
24
24
|
|
25
25
|
##
|
26
26
|
# References of MiGA
|