miga-base 1.2.10.2 → 1.2.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/miga/cli/action/download/base.rb +109 -0
- data/lib/miga/cli/action/download/gtdb.rb +55 -0
- data/lib/miga/cli/action/{ncbi_get/downloads.rb → download/ncbi.rb} +4 -102
- data/lib/miga/cli/action/get.rb +2 -1
- data/lib/miga/cli/action/gtdb_get.rb +61 -0
- data/lib/miga/cli/action/ncbi_get.rb +2 -2
- data/lib/miga/cli/base.rb +1 -0
- data/lib/miga/dataset/result.rb +4 -1
- data/lib/miga/remote_dataset/base.rb +20 -1
- data/lib/miga/remote_dataset.rb +29 -5
- data/lib/miga/taxonomy.rb +2 -0
- data/lib/miga/version.rb +2 -2
- data/scripts/stats.bash +26 -1
- data/utils/requirements.txt +1 -0
- metadata +6 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 44cd40923e839969a12a0111a071a405d8cab376a0fa6e7102a21cafd33774a9
|
4
|
+
data.tar.gz: 5d1d0af6324b8c125a178b316a54e90eff2472aa7731407fc4d315a8b3fe9a36
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8f70cb028127bd30f5293005afaf73cf10c02a003cef1e027d85b3c6a2f61377e95be5962c03f6dd199f63da252160b21b7142c135a305aacb7f46b6551c90cf
|
7
|
+
data.tar.gz: 50a8676d741894e58a1b4c9b5f53ee665ff938e2814bf4570e204f26d38de7952aab1ee36c01a458021af3bba166432572d79f84f1b5fb430684bf9034506e97
|
@@ -0,0 +1,109 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'miga/remote_dataset'
|
4
|
+
module MiGA::Cli::Action::Download
|
5
|
+
end
|
6
|
+
|
7
|
+
##
|
8
|
+
# Helper module including download functions for the *_get actions
|
9
|
+
module MiGA::Cli::Action::Download::Base
|
10
|
+
def cli_filters(opt)
|
11
|
+
opt.on(
|
12
|
+
'--blacklist PATH',
|
13
|
+
'A file with dataset names to blacklist'
|
14
|
+
) { |v| cli[:blacklist] = v }
|
15
|
+
cli.opt_flag(opt, 'dry', 'Do not download or save the datasets')
|
16
|
+
opt.on(
|
17
|
+
'--ignore-until STRING',
|
18
|
+
'Ignores all datasets until a name is found (useful for large reruns)'
|
19
|
+
) { |v| cli[:ignore_until] = v }
|
20
|
+
cli.opt_flag(
|
21
|
+
opt, 'get-metadata',
|
22
|
+
'Only download and update metadata for existing datasets', :get_md
|
23
|
+
)
|
24
|
+
end
|
25
|
+
|
26
|
+
def cli_save_actions(opt)
|
27
|
+
cli.opt_flag(
|
28
|
+
opt, '--only-metadata',
|
29
|
+
'Create datasets without input data but retrieve all metadata',
|
30
|
+
:only_md
|
31
|
+
)
|
32
|
+
opt.on(
|
33
|
+
'--save-every INT', Integer,
|
34
|
+
'Save project every this many downloaded datasets',
|
35
|
+
'If zero, it saves the project only once upon completion',
|
36
|
+
"By default: #{cli[:save_every]}"
|
37
|
+
) { |v| cli[:save_every] = v }
|
38
|
+
opt.on(
|
39
|
+
'-q', '--query',
|
40
|
+
'Register the datasets as queries, not reference datasets'
|
41
|
+
) { |v| cli[:query] = v }
|
42
|
+
opt.on(
|
43
|
+
'-u', '--unlink',
|
44
|
+
'Unlink all datasets in the project missing from the download list'
|
45
|
+
) { |v| cli[:unlink] = v }
|
46
|
+
opt.on(
|
47
|
+
'-R', '--remote-list PATH',
|
48
|
+
'Path to an output file with the list of all datasets listed remotely'
|
49
|
+
) { |v| cli[:remote_list] = v }
|
50
|
+
end
|
51
|
+
|
52
|
+
def discard_blacklisted(ds)
|
53
|
+
unless cli[:blacklist].nil?
|
54
|
+
cli.say "Discarding datasets in #{cli[:blacklist]}"
|
55
|
+
File.readlines(cli[:blacklist])
|
56
|
+
.select { |i| i !~ /^#/ }
|
57
|
+
.map(&:chomp)
|
58
|
+
.each { |i| ds.delete i }
|
59
|
+
end
|
60
|
+
ds
|
61
|
+
end
|
62
|
+
|
63
|
+
def impose_limit(ds)
|
64
|
+
max = cli[:max_datasets].to_i
|
65
|
+
if !max.zero? && max < ds.size
|
66
|
+
cli.say "Subsampling list from #{ds.size} to #{max} datasets"
|
67
|
+
sample = ds.keys.sample(max)
|
68
|
+
ds.select! { |k, _| sample.include? k }
|
69
|
+
end
|
70
|
+
ds
|
71
|
+
end
|
72
|
+
|
73
|
+
def download_entries(ds, p)
|
74
|
+
cli.say "Downloading #{ds.size} " + (ds.size == 1 ? 'entry' : 'entries')
|
75
|
+
p.do_not_save = true if cli[:save_every] != 1
|
76
|
+
ignore = !cli[:ignore_until].nil?
|
77
|
+
downloaded = 0
|
78
|
+
d = []
|
79
|
+
ds.each do |name, body|
|
80
|
+
d << name
|
81
|
+
cli.puts name
|
82
|
+
ignore = false if ignore && name == cli[:ignore_until]
|
83
|
+
next if ignore || p.dataset(name).nil? == cli[:get_md]
|
84
|
+
|
85
|
+
downloaded += 1
|
86
|
+
unless cli[:dry]
|
87
|
+
save_entry(name, body, p)
|
88
|
+
p.save! if cli[:save_every] > 1 && (downloaded % cli[:save_every]).zero?
|
89
|
+
end
|
90
|
+
end
|
91
|
+
p.do_not_save = false
|
92
|
+
p.save! if cli[:save_every] != 1
|
93
|
+
[d, downloaded]
|
94
|
+
end
|
95
|
+
|
96
|
+
def save_entry(name, body, p)
|
97
|
+
cli.say ' Locating remote dataset'
|
98
|
+
body[:md][:metadata_only] = true if cli[:only_md]
|
99
|
+
rd = MiGA::RemoteDataset.new(body[:ids], body[:db], body[:universe])
|
100
|
+
if cli[:get_md]
|
101
|
+
cli.say ' Updating dataset'
|
102
|
+
rd.update_metadata(p.dataset(name), body[:md])
|
103
|
+
else
|
104
|
+
cli.say ' Creating dataset'
|
105
|
+
rd.save_to(p, name, !cli[:query], body[:md])
|
106
|
+
cli.add_metadata(p.add_dataset(name))
|
107
|
+
end
|
108
|
+
end
|
109
|
+
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'miga/cli/action/download/base'
|
4
|
+
|
5
|
+
##
|
6
|
+
# Helper module including download functions for the gtdb_get action
|
7
|
+
module MiGA::Cli::Action::Download::Gtdb
|
8
|
+
include MiGA::Cli::Action::Download::Base
|
9
|
+
|
10
|
+
def cli_task_flags(opt)
|
11
|
+
cli.opt_flag(
|
12
|
+
opt, 'reference',
|
13
|
+
'Download only reference genomes. By default: download all'
|
14
|
+
)
|
15
|
+
end
|
16
|
+
|
17
|
+
def cli_name_modifiers(opt)
|
18
|
+
opt.on(
|
19
|
+
'--no-version-name',
|
20
|
+
'Do not add sequence version to the dataset name'
|
21
|
+
) { |v| cli[:add_version] = v }
|
22
|
+
end
|
23
|
+
|
24
|
+
def sanitize_cli
|
25
|
+
cli.ensure_par(taxon: '-T')
|
26
|
+
cli[:save_every] = 1 if cli[:dry]
|
27
|
+
end
|
28
|
+
|
29
|
+
def remote_list
|
30
|
+
cli.say 'Downloading genome list'
|
31
|
+
extra = ['sp_reps_only=' + cli[:reference].to_s]
|
32
|
+
json = MiGA::RemoteDataset.download(
|
33
|
+
:gtdb, :taxon, cli[:taxon], :genomes, nil, extra
|
34
|
+
)
|
35
|
+
doc = MiGA::Json.parse(json, contents: true)
|
36
|
+
|
37
|
+
Hash[
|
38
|
+
doc.map do |acc|
|
39
|
+
[
|
40
|
+
remote_row_name(acc),
|
41
|
+
{
|
42
|
+
ids: [acc], db: :assembly, universe: :gtdb,
|
43
|
+
md: { type: :genome, gtdb_assembly: acc }
|
44
|
+
}
|
45
|
+
]
|
46
|
+
end
|
47
|
+
]
|
48
|
+
end
|
49
|
+
|
50
|
+
def remote_row_name(asm)
|
51
|
+
acc = asm.to_s
|
52
|
+
acc.gsub!(/\.\d+\Z/, '') unless cli[:add_version]
|
53
|
+
acc.miga_name
|
54
|
+
end
|
55
|
+
end
|
@@ -1,11 +1,13 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require 'miga/
|
3
|
+
require 'miga/cli/action/download/base'
|
4
4
|
require 'csv'
|
5
5
|
|
6
6
|
##
|
7
7
|
# Helper module including download functions for the ncbi_get action
|
8
|
-
module MiGA::Cli::Action::
|
8
|
+
module MiGA::Cli::Action::Download::Ncbi
|
9
|
+
include MiGA::Cli::Action::Download::Base
|
10
|
+
|
9
11
|
def cli_task_flags(opt)
|
10
12
|
cli.opt_flag(
|
11
13
|
opt, 'reference',
|
@@ -39,48 +41,6 @@ module MiGA::Cli::Action::NcbiGet::Downloads
|
|
39
41
|
)
|
40
42
|
end
|
41
43
|
|
42
|
-
def cli_filters(opt)
|
43
|
-
opt.on(
|
44
|
-
'--blacklist PATH',
|
45
|
-
'A file with dataset names to blacklist'
|
46
|
-
) { |v| cli[:blacklist] = v }
|
47
|
-
cli.opt_flag(opt, 'dry', 'Do not download or save the datasets')
|
48
|
-
opt.on(
|
49
|
-
'--ignore-until STRING',
|
50
|
-
'Ignores all datasets until a name is found (useful for large reruns)'
|
51
|
-
) { |v| cli[:ignore_until] = v }
|
52
|
-
cli.opt_flag(
|
53
|
-
opt, 'get-metadata',
|
54
|
-
'Only download and update metadata for existing datasets', :get_md
|
55
|
-
)
|
56
|
-
end
|
57
|
-
|
58
|
-
def cli_save_actions(opt)
|
59
|
-
cli.opt_flag(
|
60
|
-
opt, 'only-metadata',
|
61
|
-
'Create datasets without input data but retrieve all metadata',
|
62
|
-
:only_md
|
63
|
-
)
|
64
|
-
opt.on(
|
65
|
-
'--save-every INT', Integer,
|
66
|
-
'Save project every this many downloaded datasets',
|
67
|
-
'If zero, it saves the project only once upon completion',
|
68
|
-
"By default: #{cli[:save_every]}"
|
69
|
-
) { |v| cli[:save_every] = v }
|
70
|
-
opt.on(
|
71
|
-
'-q', '--query',
|
72
|
-
'Register the datasets as queries, not reference datasets'
|
73
|
-
) { |v| cli[:query] = v }
|
74
|
-
opt.on(
|
75
|
-
'-u', '--unlink',
|
76
|
-
'Unlink all datasets in the project missing from the download list'
|
77
|
-
) { |v| cli[:unlink] = v }
|
78
|
-
opt.on(
|
79
|
-
'-R', '--remote-list PATH',
|
80
|
-
'Path to an output file with the list of all datasets listed remotely'
|
81
|
-
) { |v| cli[:remote_list] = v }
|
82
|
-
end
|
83
|
-
|
84
44
|
def sanitize_cli
|
85
45
|
cli.ensure_par(taxon: '-T')
|
86
46
|
tasks = %w[reference complete chromosome scaffold contig]
|
@@ -169,62 +129,4 @@ module MiGA::Cli::Action::NcbiGet::Downloads
|
|
169
129
|
url_param[:q] += ')'
|
170
130
|
url_base + URI.encode_www_form(url_param)
|
171
131
|
end
|
172
|
-
|
173
|
-
def discard_blacklisted(ds)
|
174
|
-
unless cli[:blacklist].nil?
|
175
|
-
cli.say "Discarding datasets in #{cli[:blacklist]}"
|
176
|
-
File.readlines(cli[:blacklist])
|
177
|
-
.select { |i| i !~ /^#/ }
|
178
|
-
.map(&:chomp)
|
179
|
-
.each { |i| ds.delete i }
|
180
|
-
end
|
181
|
-
ds
|
182
|
-
end
|
183
|
-
|
184
|
-
def impose_limit(ds)
|
185
|
-
max = cli[:max_datasets].to_i
|
186
|
-
if !max.zero? && max < ds.size
|
187
|
-
cli.say "Subsampling list from #{ds.size} to #{max} datasets"
|
188
|
-
sample = ds.keys.sample(max)
|
189
|
-
ds.select! { |k, _| sample.include? k }
|
190
|
-
end
|
191
|
-
ds
|
192
|
-
end
|
193
|
-
|
194
|
-
def download_entries(ds, p)
|
195
|
-
cli.say "Downloading #{ds.size} " + (ds.size == 1 ? 'entry' : 'entries')
|
196
|
-
p.do_not_save = true if cli[:save_every] != 1
|
197
|
-
ignore = !cli[:ignore_until].nil?
|
198
|
-
downloaded = 0
|
199
|
-
d = []
|
200
|
-
ds.each do |name, body|
|
201
|
-
d << name
|
202
|
-
cli.puts name
|
203
|
-
ignore = false if ignore && name == cli[:ignore_until]
|
204
|
-
next if ignore || p.dataset(name).nil? == cli[:get_md]
|
205
|
-
|
206
|
-
downloaded += 1
|
207
|
-
unless cli[:dry]
|
208
|
-
save_entry(name, body, p)
|
209
|
-
p.save! if cli[:save_every] > 1 && (downloaded % cli[:save_every]).zero?
|
210
|
-
end
|
211
|
-
end
|
212
|
-
p.do_not_save = false
|
213
|
-
p.save! if cli[:save_every] != 1
|
214
|
-
[d, downloaded]
|
215
|
-
end
|
216
|
-
|
217
|
-
def save_entry(name, body, p)
|
218
|
-
cli.say ' Locating remote dataset'
|
219
|
-
body[:md][:metadata_only] = true if cli[:only_md]
|
220
|
-
rd = MiGA::RemoteDataset.new(body[:ids], body[:db], body[:universe])
|
221
|
-
if cli[:get_md]
|
222
|
-
cli.say ' Updating dataset'
|
223
|
-
rd.update_metadata(p.dataset(name), body[:md])
|
224
|
-
else
|
225
|
-
cli.say ' Creating dataset'
|
226
|
-
rd.save_to(p, name, !cli[:query], body[:md])
|
227
|
-
cli.add_metadata(p.add_dataset(name))
|
228
|
-
end
|
229
|
-
end
|
230
132
|
end
|
data/lib/miga/cli/action/get.rb
CHANGED
@@ -17,7 +17,8 @@ class MiGA::Cli::Action::Get < MiGA::Cli::Action
|
|
17
17
|
) { |v| cli[:ids] = v }
|
18
18
|
opt.on(
|
19
19
|
'-U', '--universe STRING',
|
20
|
-
"Universe of the remote database. By default: #{cli[:universe]}"
|
20
|
+
"Universe of the remote database. By default: #{cli[:universe]}",
|
21
|
+
"Supported: #{MiGA::RemoteDataset.UNIVERSE.keys.join(', ')}"
|
21
22
|
) { |v| cli[:universe] = v.to_sym }
|
22
23
|
opt.on(
|
23
24
|
'--db STRING',
|
@@ -0,0 +1,61 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'miga/cli/action'
|
4
|
+
|
5
|
+
class MiGA::Cli::Action::GtdbGet < MiGA::Cli::Action
|
6
|
+
require 'miga/cli/action/download/gtdb'
|
7
|
+
include MiGA::Cli::Action::Download::Gtdb
|
8
|
+
|
9
|
+
def parse_cli
|
10
|
+
cli.defaults = {
|
11
|
+
query: false, unlink: false,
|
12
|
+
reference: false, add_version: true, dry: false,
|
13
|
+
get_md: false, only_md: false, save_every: 1
|
14
|
+
}
|
15
|
+
cli.parse do |opt|
|
16
|
+
cli.opt_object(opt, [:project])
|
17
|
+
opt.on(
|
18
|
+
'-T', '--taxon STRING',
|
19
|
+
'(Mandatory) Taxon name in GTDB format (e.g., g__Escherichia)'
|
20
|
+
) { |v| cli[:taxon] = v }
|
21
|
+
opt.on(
|
22
|
+
'--max INT', Integer,
|
23
|
+
'Maximum number of datasets to download (by default: unlimited)'
|
24
|
+
) { |v| cli[:max_datasets] = v }
|
25
|
+
opt.on(
|
26
|
+
'-m', '--metadata STRING',
|
27
|
+
'Metadata as key-value pairs separated by = and delimited by comma',
|
28
|
+
'Values are saved as strings except for booleans (true / false) or nil'
|
29
|
+
) { |v| cli[:metadata] = v }
|
30
|
+
cli_task_flags(opt)
|
31
|
+
cli_name_modifiers(opt)
|
32
|
+
cli_filters(opt)
|
33
|
+
cli_save_actions(opt)
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
def perform
|
38
|
+
sanitize_cli
|
39
|
+
p = cli.load_project
|
40
|
+
ds = remote_list
|
41
|
+
ds = discard_blacklisted(ds)
|
42
|
+
ds = impose_limit(ds)
|
43
|
+
d, downloaded = download_entries(ds, p)
|
44
|
+
|
45
|
+
# Finalize
|
46
|
+
cli.say "Datasets listed: #{d.size}"
|
47
|
+
act = cli[:dry] ? 'to download' : 'downloaded'
|
48
|
+
cli.say "Datasets #{act}: #{downloaded}"
|
49
|
+
unless cli[:remote_list].nil?
|
50
|
+
File.open(cli[:remote_list], 'w') do |fh|
|
51
|
+
d.each { |i| fh.puts i }
|
52
|
+
end
|
53
|
+
end
|
54
|
+
return unless cli[:unlink]
|
55
|
+
|
56
|
+
unlink = p.dataset_names - d
|
57
|
+
unlink.each { |i| p.unlink_dataset(i).remove! }
|
58
|
+
cli.say "Datasets unlinked: #{unlink.size}"
|
59
|
+
end
|
60
|
+
|
61
|
+
end
|
@@ -3,8 +3,8 @@
|
|
3
3
|
require 'miga/cli/action'
|
4
4
|
|
5
5
|
class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action
|
6
|
-
require 'miga/cli/action/
|
7
|
-
include MiGA::Cli::Action::
|
6
|
+
require 'miga/cli/action/download/ncbi'
|
7
|
+
include MiGA::Cli::Action::Download::Ncbi
|
8
8
|
|
9
9
|
def parse_cli
|
10
10
|
cli.defaults = {
|
data/lib/miga/cli/base.rb
CHANGED
@@ -19,6 +19,7 @@ module MiGA::Cli::Base
|
|
19
19
|
add: 'Create a dataset in a MiGA project',
|
20
20
|
get: 'Download a dataset from public databases into a MiGA project',
|
21
21
|
ncbi_get: 'Download all genomes in a taxon from NCBI into a MiGA project',
|
22
|
+
gtdb_get: 'Download all genomes in a taxon from GTDB into a MiGA project',
|
22
23
|
rm: 'Remove a dataset from a MiGA project',
|
23
24
|
find: 'Find unregistered datasets based on result files',
|
24
25
|
ln: 'Link datasets (including results) from one project to another',
|
data/lib/miga/dataset/result.rb
CHANGED
@@ -384,7 +384,10 @@ module MiGA::Dataset::Result
|
|
384
384
|
##
|
385
385
|
# Add result type +:stats+ at +base+ (no +_opts+ supported)
|
386
386
|
def add_result_stats(base, _opts)
|
387
|
-
|
387
|
+
add_files_to_ds_result(
|
388
|
+
MiGA::Result.new("#{base}.json"), name,
|
389
|
+
trna_list: '.trna.txt'
|
390
|
+
)
|
388
391
|
end
|
389
392
|
|
390
393
|
##
|
@@ -12,6 +12,8 @@ end
|
|
12
12
|
|
13
13
|
module MiGA::RemoteDataset::Base
|
14
14
|
@@_EUTILS = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
|
15
|
+
@@_EBI_API = 'https://www.ebi.ac.uk/Tools'
|
16
|
+
@@_GTDB_API = 'https://api.gtdb.ecogenomic.org'
|
15
17
|
@@_NCBI_API_KEY = lambda { |url|
|
16
18
|
ENV['NCBI_API_KEY'].nil? ? url : "#{url}&api_key=#{ENV['NCBI_API_KEY']}"
|
17
19
|
}
|
@@ -43,9 +45,26 @@ module MiGA::RemoteDataset::Base
|
|
43
45
|
},
|
44
46
|
ebi: {
|
45
47
|
dbs: { embl: { stage: :assembly, format: :fasta } },
|
46
|
-
url:
|
48
|
+
url: "#{@@_EBI_API}/dbfetch/dbfetch/%1$s/%2$s/%3$s",
|
47
49
|
method: :rest
|
48
50
|
},
|
51
|
+
gtdb: {
|
52
|
+
dbs: {
|
53
|
+
# This is a dummy entry plugged directly to +ncbi_asm_rest+
|
54
|
+
assembly: { stage: :assembly, format: :fasta_gz, getter: :ncbi_asm },
|
55
|
+
# The 'taxon' namespace actually returns a list of genomes (+format+)
|
56
|
+
taxon: {
|
57
|
+
stage: :metadata, format: :genomes, map_to: [:assembly],
|
58
|
+
extra: ['sp_reps_only=false']
|
59
|
+
},
|
60
|
+
# The 'genome' namespace actually returns the taxonomy (+format+)
|
61
|
+
genome: { stage: :metadata, format: 'taxon-history' }
|
62
|
+
},
|
63
|
+
url: "#{@@_GTDB_API}/%1$s/%2$s/%3$s?%4$s",
|
64
|
+
method: :rest,
|
65
|
+
map_to_universe: :ncbi,
|
66
|
+
headers: 'accept: application/json' # < TODO not currently supported
|
67
|
+
},
|
49
68
|
ncbi: {
|
50
69
|
dbs: {
|
51
70
|
nuccore: { stage: :assembly, format: :fasta, getter: :ncbi_gb },
|
data/lib/miga/remote_dataset.rb
CHANGED
@@ -49,7 +49,7 @@ class MiGA::RemoteDataset < MiGA::MiGA
|
|
49
49
|
@@UNIVERSE.keys.include?(@universe) or
|
50
50
|
raise "Unknown Universe: #{@universe}. Try: #{@@UNIVERSE.keys}"
|
51
51
|
@@UNIVERSE[@universe][:dbs].include?(@db) or
|
52
|
-
raise "Unknown Database: #{@db}. Try: #{@@UNIVERSE[@universe][:dbs]}"
|
52
|
+
raise "Unknown Database: #{@db}. Try: #{@@UNIVERSE[@universe][:dbs].keys}"
|
53
53
|
@_ncbi_asm_json_doc = nil
|
54
54
|
# FIXME: Part of the +map_to+ support:
|
55
55
|
# unless @@UNIVERSE[@universe][:dbs][@db][:map_to].nil?
|
@@ -104,6 +104,9 @@ class MiGA::RemoteDataset < MiGA::MiGA
|
|
104
104
|
when :ebi, :ncbi, :web
|
105
105
|
# Get taxonomy
|
106
106
|
@metadata[:tax] = get_ncbi_taxonomy
|
107
|
+
when :gtdb
|
108
|
+
# Get taxonomy
|
109
|
+
@metadata[:tax] = get_gtdb_taxonomy
|
107
110
|
end
|
108
111
|
@metadata = get_type_status(metadata)
|
109
112
|
end
|
@@ -129,10 +132,9 @@ class MiGA::RemoteDataset < MiGA::MiGA
|
|
129
132
|
end
|
130
133
|
|
131
134
|
##
|
132
|
-
# Get NCBI taxonomy as MiGA::Taxonomy
|
135
|
+
# Get NCBI taxonomy as MiGA::Taxonomy
|
133
136
|
def get_ncbi_taxonomy
|
134
|
-
tax_id = get_ncbi_taxid
|
135
|
-
return nil if tax_id.nil?
|
137
|
+
tax_id = get_ncbi_taxid or return
|
136
138
|
|
137
139
|
lineage = { ns: 'ncbi' }
|
138
140
|
doc = MiGA::RemoteDataset.download(:ncbi, :taxonomy, tax_id, :xml)
|
@@ -147,12 +149,34 @@ class MiGA::RemoteDataset < MiGA::MiGA
|
|
147
149
|
MiGA::Taxonomy.new(lineage)
|
148
150
|
end
|
149
151
|
|
152
|
+
##
|
153
|
+
# Get GTDB taxonomy as MiGA::Taxonomy
|
154
|
+
def get_gtdb_taxonomy
|
155
|
+
gtdb_genome = metadata[:gtdb_assembly] or return
|
156
|
+
|
157
|
+
doc = MiGA::Json.parse(
|
158
|
+
MiGA::RemoteDataset.download(
|
159
|
+
:gtdb, :genome, gtdb_genome, 'taxon-history', nil, ['']
|
160
|
+
),
|
161
|
+
contents: true
|
162
|
+
)
|
163
|
+
lineage = { ns: 'gtdb' }
|
164
|
+
lineage.merge!(doc.first) # Get only the latest available classification
|
165
|
+
release = lineage.delete(:release)
|
166
|
+
@metadata[:gtdb_release] = release
|
167
|
+
lineage.transform_values! { |v| v.gsub(/^\S__/, '') }
|
168
|
+
MiGA.DEBUG "Got lineage from #{release}: #{lineage}"
|
169
|
+
MiGA::Taxonomy.new(lineage)
|
170
|
+
end
|
171
|
+
|
150
172
|
##
|
151
173
|
# Get the JSON document describing an NCBI assembly entry.
|
152
174
|
def ncbi_asm_json_doc
|
153
175
|
return @_ncbi_asm_json_doc unless @_ncbi_asm_json_doc.nil?
|
154
176
|
|
155
|
-
|
177
|
+
if db == :assembly && %i[ncbi gtdb].include?(universe)
|
178
|
+
metadata[:ncbi_asm] ||= ids.first
|
179
|
+
end
|
156
180
|
return nil unless metadata[:ncbi_asm]
|
157
181
|
|
158
182
|
ncbi_asm_id = self.class.ncbi_asm_acc2id metadata[:ncbi_asm]
|
data/lib/miga/taxonomy.rb
CHANGED
data/lib/miga/version.rb
CHANGED
@@ -12,7 +12,7 @@ module MiGA
|
|
12
12
|
# - String indicating release status:
|
13
13
|
# - rc* release candidate, not released as gem
|
14
14
|
# - [0-9]+ stable release, released as gem
|
15
|
-
VERSION = [1.2,
|
15
|
+
VERSION = [1.2, 12, 0].freeze
|
16
16
|
|
17
17
|
##
|
18
18
|
# Nickname for the current major.minor version.
|
@@ -20,7 +20,7 @@ module MiGA
|
|
20
20
|
|
21
21
|
##
|
22
22
|
# Date of the current gem relese.
|
23
|
-
VERSION_DATE = Date.new(2022, 12,
|
23
|
+
VERSION_DATE = Date.new(2022, 12, 30)
|
24
24
|
|
25
25
|
##
|
26
26
|
# References of MiGA
|
data/scripts/stats.bash
CHANGED
@@ -9,7 +9,32 @@ DIR="$PROJECT/data/90.stats"
|
|
9
9
|
cd "$DIR"
|
10
10
|
|
11
11
|
# Initialize
|
12
|
-
miga date > "$DATASET.start"
|
12
|
+
miga date > "${DATASET}.start"
|
13
|
+
|
14
|
+
# tRNAscan-SE
|
15
|
+
fa="../05.assembly/${DATASET}.LargeContigs.fna"
|
16
|
+
if [[ -s "$fa" ]] ; then
|
17
|
+
d="$(miga ls -P "$PROJECT" -D "$DATASET" -m tax:d | awk '{print $2}')"
|
18
|
+
if [[ "$d" == "Bacteria" || "$d" == "Archaea" || "$d" == "Eukaryota" ]] ; then
|
19
|
+
dom_opt="-$(echo "$d" | perl -pe 's/(\S).*/$1/')"
|
20
|
+
out="${DATASET}.trna.txt"
|
21
|
+
# `echo O` is to avoid a hang from a pre-existing output file.
|
22
|
+
# This is better than pre-checking (and removing), because it avoids
|
23
|
+
# the (unlikely) scenario of a file racing (e.g., a file created right
|
24
|
+
# before tRNAscan-SE starts, or a `rm` failure).
|
25
|
+
#
|
26
|
+
# The trailing `|| true` is to treat failure as non-fatal
|
27
|
+
echo O | tRNAscan-SE $dom_opt -o "$out" -q "$fa" || true
|
28
|
+
if [[ -s "$out" ]] ; then
|
29
|
+
cnt=$(tail -n +4 "$out" | wc -l | awk '{print $1}')
|
30
|
+
aa="$(tail -n +4 "$out" | grep -v 'pseudo$' | awk '{print $5}' \
|
31
|
+
| grep -v 'Undet' | perl -pe 's/^f?([A-Za-z]+)[0-9]?/$1/' \
|
32
|
+
| sort | uniq | wc -l | awk '{print $1}')"
|
33
|
+
miga edit -P "$PROJECT" -D "$DATASET" \
|
34
|
+
-m "trna_count=Int($cnt),trna_aa=Int($aa)"
|
35
|
+
fi
|
36
|
+
fi
|
37
|
+
fi
|
13
38
|
|
14
39
|
# Calculate statistics
|
15
40
|
for i in raw_reads trimmed_fasta assembly cds essential_genes ssu distances taxonomy ; do
|
data/utils/requirements.txt
CHANGED
@@ -20,3 +20,4 @@ Fastp (reads) fastp https://github.com/OpenGene/fastp
|
|
20
20
|
Temurin (rdp) java https://adoptium.net/ Any Java VM would work
|
21
21
|
MyTaxa (mytaxa) MyTaxa http://enve-omics.ce.gatech.edu/mytaxa
|
22
22
|
Krona (mytaxa) ktImportText https://github.com/marbl/Krona/wiki
|
23
|
+
tRNAscan-SE tRNAscan-SE http://trna.ucsc.edu/tRNAscan-SE/ Required version: 2+
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: miga-base
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.
|
4
|
+
version: 1.2.12.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luis M. Rodriguez-R
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-12-
|
11
|
+
date: 2022-12-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: daemons
|
@@ -141,6 +141,9 @@ files:
|
|
141
141
|
- lib/miga/cli/action/derep_wf.rb
|
142
142
|
- lib/miga/cli/action/doctor.rb
|
143
143
|
- lib/miga/cli/action/doctor/base.rb
|
144
|
+
- lib/miga/cli/action/download/base.rb
|
145
|
+
- lib/miga/cli/action/download/gtdb.rb
|
146
|
+
- lib/miga/cli/action/download/ncbi.rb
|
144
147
|
- lib/miga/cli/action/edit.rb
|
145
148
|
- lib/miga/cli/action/env.rb
|
146
149
|
- lib/miga/cli/action/files.rb
|
@@ -148,6 +151,7 @@ files:
|
|
148
151
|
- lib/miga/cli/action/generic.rb
|
149
152
|
- lib/miga/cli/action/get.rb
|
150
153
|
- lib/miga/cli/action/get_db.rb
|
154
|
+
- lib/miga/cli/action/gtdb_get.rb
|
151
155
|
- lib/miga/cli/action/index_wf.rb
|
152
156
|
- lib/miga/cli/action/init.rb
|
153
157
|
- lib/miga/cli/action/init/daemon_helper.rb
|
@@ -156,7 +160,6 @@ files:
|
|
156
160
|
- lib/miga/cli/action/ln.rb
|
157
161
|
- lib/miga/cli/action/ls.rb
|
158
162
|
- lib/miga/cli/action/ncbi_get.rb
|
159
|
-
- lib/miga/cli/action/ncbi_get/downloads.rb
|
160
163
|
- lib/miga/cli/action/new.rb
|
161
164
|
- lib/miga/cli/action/next_step.rb
|
162
165
|
- lib/miga/cli/action/option.rb
|