miga-base 1.2.16.1 → 1.2.17.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/LICENSE +2 -2
- data/lib/miga/cli/action/download/seqcode.rb +69 -0
- data/lib/miga/cli/action/seqcode_get.rb +37 -0
- data/lib/miga/cli/action/wf.rb +36 -14
- data/lib/miga/cli/base.rb +1 -0
- data/lib/miga/common/net.rb +9 -3
- data/lib/miga/remote_dataset/base.rb +13 -0
- data/lib/miga/remote_dataset.rb +4 -1
- data/lib/miga/version.rb +2 -2
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a33ac43db8c09ad7cad760c1bc82102ca1629beba12bea7c15d5391569210d95
|
4
|
+
data.tar.gz: 5104348e8471729c6ce59612a88e624f44d382480e313e00329e8766ead0e8a5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fd9c726cc1da17dc2146772491a580d471739876f036368fb5a2c2fab475604f0ae858b3e02cebcb7dd22c029d01ade2010a68acb86e752a85e7723dabcebec0
|
7
|
+
data.tar.gz: 898c7b240b5d1c8e8f911f2111d9e326752cdb25b36c646aa21402c140c52187382fc9bf4cf621b5a0b3cfc715f0d596beefaaece3827822b4ad515d6e420aac
|
data/LICENSE
CHANGED
@@ -1,6 +1,6 @@
|
|
1
|
-
|
1
|
+
The Artistic License 2.0
|
2
2
|
|
3
|
-
|
3
|
+
Copyright (c) 2016-2023 Luis M Rodriguez-R (Universität Innsbruck)
|
4
4
|
|
5
5
|
Everyone is permitted to copy and distribute verbatim copies
|
6
6
|
of this license document, but changing it is not allowed.
|
@@ -0,0 +1,69 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'miga/cli/action/download/base'
|
4
|
+
|
5
|
+
##
|
6
|
+
# Helper module including download functions for the seqcode_get action
|
7
|
+
module MiGA::Cli::Action::Download::Seqcode
|
8
|
+
include MiGA::Cli::Action::Download::Base
|
9
|
+
|
10
|
+
def cli_task_flags(_opt)
|
11
|
+
end
|
12
|
+
|
13
|
+
def cli_name_modifiers(opt)
|
14
|
+
opt.on(
|
15
|
+
'--no-version-name',
|
16
|
+
'Do not add sequence version to the dataset name'
|
17
|
+
) { |v| cli[:add_version] = v }
|
18
|
+
end
|
19
|
+
|
20
|
+
def sanitize_cli
|
21
|
+
cli[:save_every] = 1 if cli[:dry]
|
22
|
+
end
|
23
|
+
|
24
|
+
def remote_list
|
25
|
+
cli.say 'Downloading genome list'
|
26
|
+
current_page = 1
|
27
|
+
total_pages = 1
|
28
|
+
ds = {}
|
29
|
+
|
30
|
+
while current_page <= total_pages
|
31
|
+
json = MiGA::RemoteDataset.download(
|
32
|
+
:seqcode, :'type-genomes', nil, :json, nil,
|
33
|
+
["page=#{current_page}"]
|
34
|
+
)
|
35
|
+
doc = MiGA::Json.parse(json, contents: true)
|
36
|
+
current_page = doc[:current_page] + 1
|
37
|
+
total_pages = doc[:total_pages]
|
38
|
+
|
39
|
+
doc[:values].each do |name|
|
40
|
+
next unless name[:type_material]
|
41
|
+
acc = name[:type_material].values.first
|
42
|
+
db = name[:type_material].keys.first
|
43
|
+
next unless %i[assembly nuccore].include?(db) # No INSDC genome, ignore
|
44
|
+
|
45
|
+
classif = name[:classification] || {}
|
46
|
+
tax = MiGA::Taxonomy.new(Hash[classif.map { |i| [i[:rank], i[:name]] }])
|
47
|
+
tax << { 'ns' => 'seqcode', name[:rank] => name[:name] }
|
48
|
+
d = {
|
49
|
+
ids: [acc], db: db, universe: :seqcode,
|
50
|
+
md: {
|
51
|
+
type: :genome, tax: tax, is_type: true,
|
52
|
+
type_rel: 'SeqCode type genome',
|
53
|
+
seqcode_url: "https://seqco.de/i:#{name[:id]}"
|
54
|
+
}
|
55
|
+
}
|
56
|
+
d[:md][:get_ncbi_taxonomy] = true if cli[:get_ncbi_taxonomy]
|
57
|
+
ds[remote_row_name(tax, db, acc)] = d
|
58
|
+
end
|
59
|
+
end
|
60
|
+
ds
|
61
|
+
end
|
62
|
+
|
63
|
+
def remote_row_name(tax, db, asm)
|
64
|
+
acc = asm.to_s
|
65
|
+
acc.gsub!(/\.\d+\Z/, '') unless cli[:add_version]
|
66
|
+
db_short = { assembly: 'asm', nuccore: 'gb' }[db]
|
67
|
+
"#{tax.lowest[1]}_#{db_short}_#{acc}".miga_name
|
68
|
+
end
|
69
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'miga/cli/action'
|
4
|
+
|
5
|
+
class MiGA::Cli::Action::SeqcodeGet < MiGA::Cli::Action
|
6
|
+
require 'miga/cli/action/download/seqcode'
|
7
|
+
include MiGA::Cli::Action::Download::Seqcode
|
8
|
+
|
9
|
+
def parse_cli
|
10
|
+
cli.defaults = {
|
11
|
+
query: false, unlink: false,
|
12
|
+
reference: false, add_version: true, dry: false,
|
13
|
+
get_md: false, only_md: false, save_every: 1
|
14
|
+
}
|
15
|
+
cli.parse do |opt|
|
16
|
+
cli.opt_object(opt, [:project])
|
17
|
+
cli_base_flags(opt)
|
18
|
+
opt.on(
|
19
|
+
'--ncbi-taxonomy',
|
20
|
+
'Retrieve NCBI taxonomy instead of SeqCode taxonomy'
|
21
|
+
) { |v| cli[:get_ncbi_taxonomy] = v }
|
22
|
+
cli_task_flags(opt)
|
23
|
+
cli_name_modifiers(opt)
|
24
|
+
cli_filters(opt)
|
25
|
+
cli_save_actions(opt)
|
26
|
+
opt.on(
|
27
|
+
'--ncbi-api-key STRING',
|
28
|
+
'NCBI API key'
|
29
|
+
) { |v| ENV['NCBI_API_KEY'] = v }
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
def perform
|
34
|
+
generic_perform
|
35
|
+
end
|
36
|
+
|
37
|
+
end
|
data/lib/miga/cli/action/wf.rb
CHANGED
@@ -47,10 +47,15 @@ module MiGA::Cli::Action::Wf
|
|
47
47
|
'--gtdb-ref',
|
48
48
|
'Only download reference anchor genomes in GTDB (requires -G)'
|
49
49
|
) { |v| cli[:gtdb_ref] = v }
|
50
|
+
opt.on(
|
51
|
+
'-S', '--seqcode-type',
|
52
|
+
'Download all type genomes from the SeqCode Registry'
|
53
|
+
) { |v| cli[:seqcode_type] = v }
|
50
54
|
opt.on(
|
51
55
|
'--max-download INT', Integer,
|
52
|
-
'Maximum number of genomes to download (by default: unlimited)'
|
53
|
-
|
56
|
+
'Maximum number of genomes to download (by default: unlimited)',
|
57
|
+
'It applies independently to -T, -G and --S'
|
58
|
+
) { |v| cli[:max_download] = v }
|
54
59
|
end
|
55
60
|
if params[:qual]
|
56
61
|
opt.on(
|
@@ -131,24 +136,39 @@ module MiGA::Cli::Action::Wf
|
|
131
136
|
paired = cli[:input_type].to_s.include?('_paired')
|
132
137
|
cli[:regexp] ||= MiGA::Cli.FILE_REGEXP(paired)
|
133
138
|
|
134
|
-
# Create empty project
|
139
|
+
# Create empty project and populate with datasets
|
140
|
+
p = initialize_empty_project(p_metadata)
|
141
|
+
download_datasets
|
142
|
+
import_datasets(stage)
|
143
|
+
|
144
|
+
# Define datasets metadata
|
145
|
+
p.load
|
146
|
+
d_metadata[:type] = cli[:dataset_type]
|
147
|
+
p.each_dataset { |d| transfer_metadata(d, d_metadata) }
|
148
|
+
p
|
149
|
+
end
|
150
|
+
|
151
|
+
def initialize_empty_project(metadata)
|
135
152
|
call_cli(
|
136
153
|
['new', '-P', cli[:outdir], '-t', cli[:project_type]]
|
137
154
|
) unless MiGA::Project.exist? cli[:outdir]
|
138
155
|
|
139
156
|
# Define project metadata
|
140
157
|
p = cli.load_project(:outdir, '-o')
|
141
|
-
|
142
|
-
transfer_metadata(p,
|
158
|
+
metadata[:type] = cli[:project_type]
|
159
|
+
transfer_metadata(p, metadata)
|
143
160
|
%i[haai_p aai_p ani_p ess_coll min_qual].each do |i|
|
144
161
|
p.set_option(i, cli[i])
|
145
162
|
end
|
163
|
+
p
|
164
|
+
end
|
146
165
|
|
166
|
+
def download_datasets
|
147
167
|
# Download datasets from NCBI
|
148
168
|
unless cli[:ncbi_taxon].nil?
|
149
169
|
what = cli[:ncbi_draft] ? '--all' : '--complete'
|
150
170
|
cmd = ['ncbi_get', '-P', cli[:outdir], '-T', cli[:ncbi_taxon], what]
|
151
|
-
cmd += ['--max', cli[:
|
171
|
+
cmd += ['--max', cli[:max_download]] if cli[:max_download]
|
152
172
|
call_cli(cmd)
|
153
173
|
end
|
154
174
|
|
@@ -156,11 +176,19 @@ module MiGA::Cli::Action::Wf
|
|
156
176
|
unless cli[:gtdb_taxon].nil?
|
157
177
|
cmd = ['gtdb_get', '-P', cli[:outdir], '-T', cli[:gtdb_taxon]]
|
158
178
|
cmd << '--reference' if cli[:gtdb_ref]
|
159
|
-
cmd += ['--max', cli[:
|
179
|
+
cmd += ['--max', cli[:max_download]] if cli[:max_download]
|
160
180
|
call_cli(cmd)
|
161
181
|
end
|
162
182
|
|
163
|
-
#
|
183
|
+
# Download datasets from SeqCode Registry
|
184
|
+
if cli[:seqcode_type]
|
185
|
+
cmd = ['seqcode_get', '-P', cli[:outdir]]
|
186
|
+
cmd += ['--max', cli[:max_download]] if cli[:max_download]
|
187
|
+
call_cli(cmd)
|
188
|
+
end
|
189
|
+
end
|
190
|
+
|
191
|
+
def import_datasets(stage)
|
164
192
|
call_cli(
|
165
193
|
[
|
166
194
|
'add',
|
@@ -171,12 +199,6 @@ module MiGA::Cli::Action::Wf
|
|
171
199
|
'-R', cli[:regexp]
|
172
200
|
] + cli.files
|
173
201
|
) unless cli.files.empty?
|
174
|
-
|
175
|
-
# Define datasets metadata
|
176
|
-
p.load
|
177
|
-
d_metadata[:type] = cli[:dataset_type]
|
178
|
-
p.each_dataset { |d| transfer_metadata(d, d_metadata) }
|
179
|
-
p
|
180
202
|
end
|
181
203
|
|
182
204
|
def summarize(which = %w[cds assembly essential_genes ssu])
|
data/lib/miga/cli/base.rb
CHANGED
@@ -20,6 +20,7 @@ module MiGA::Cli::Base
|
|
20
20
|
get: 'Download a dataset from public databases into a MiGA project',
|
21
21
|
ncbi_get: 'Download all genomes in a taxon from NCBI into a MiGA project',
|
22
22
|
gtdb_get: 'Download all genomes in a taxon from GTDB into a MiGA project',
|
23
|
+
seqcode_get: 'Download all type genomes from SeqCode into a MiGA project',
|
23
24
|
rm: 'Remove a dataset from a MiGA project',
|
24
25
|
find: 'Find unregistered datasets based on result files',
|
25
26
|
ln: 'Link datasets (including results) from one project to another',
|
data/lib/miga/common/net.rb
CHANGED
@@ -15,16 +15,22 @@ module MiGA::Common::Net
|
|
15
15
|
def known_hosts(name)
|
16
16
|
case name.to_sym
|
17
17
|
when :miga_online_ftp
|
18
|
-
|
18
|
+
"ftp://#{main_server}//" # <- // to simplify chdir in connection
|
19
19
|
when :miga_db
|
20
|
-
|
20
|
+
"ftp://#{main_server}/db"
|
21
21
|
when :miga_dist
|
22
|
-
|
22
|
+
"ftp://#{main_server}/dist"
|
23
23
|
else
|
24
24
|
raise "Unrecognized server name: #{host}"
|
25
25
|
end
|
26
26
|
end
|
27
27
|
|
28
|
+
##
|
29
|
+
# Returns the address of the main MiGA server
|
30
|
+
def main_server
|
31
|
+
'gatech.microbial-genomes.org'
|
32
|
+
end
|
33
|
+
|
28
34
|
##
|
29
35
|
# Connect to an FTP +host+ (String) or a known host name (Symbol, see
|
30
36
|
# +.known_hosts+)
|
@@ -14,6 +14,7 @@ module MiGA::RemoteDataset::Base
|
|
14
14
|
@@_EUTILS = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
|
15
15
|
@@_EBI_API = 'https://www.ebi.ac.uk/Tools'
|
16
16
|
@@_GTDB_API = 'https://api.gtdb.ecogenomic.org'
|
17
|
+
@@_SEQCODE_API = 'https://disc-genomics.uibk.ac.at/seqcode'
|
17
18
|
@@_NCBI_API_KEY = lambda { |url|
|
18
19
|
ENV['NCBI_API_KEY'].nil? ? url : "#{url}&api_key=#{ENV['NCBI_API_KEY']}"
|
19
20
|
}
|
@@ -65,6 +66,18 @@ module MiGA::RemoteDataset::Base
|
|
65
66
|
map_to_universe: :ncbi,
|
66
67
|
headers: 'accept: application/json' # < TODO not currently supported
|
67
68
|
},
|
69
|
+
seqcode: {
|
70
|
+
dbs: {
|
71
|
+
# These are dummy entries plugged directly to +ncbi_*_rest+
|
72
|
+
assembly: { stage: :assembly, format: :fasta_gz, getter: :ncbi_asm },
|
73
|
+
nuccore: { stage: :assembly, format: :fasta, getter: :ncbi_gb },
|
74
|
+
# This is the list of type genomes
|
75
|
+
:'type-genomes' => { stage: :metadata, format: :json }
|
76
|
+
},
|
77
|
+
url: "#{@@_SEQCODE_API}/%1$s.json?%4$s",
|
78
|
+
method: :rest,
|
79
|
+
map_to_universe: :ncbi
|
80
|
+
},
|
68
81
|
ncbi: {
|
69
82
|
dbs: {
|
70
83
|
nuccore: { stage: :assembly, format: :fasta, getter: :ncbi_gb },
|
data/lib/miga/remote_dataset.rb
CHANGED
@@ -133,6 +133,8 @@ class MiGA::RemoteDataset < MiGA::MiGA
|
|
133
133
|
when :gtdb
|
134
134
|
# Get taxonomy
|
135
135
|
@metadata[:tax] = get_gtdb_taxonomy
|
136
|
+
when :seqcode
|
137
|
+
# Do nothing, taxonomy already defined
|
136
138
|
end
|
137
139
|
|
138
140
|
if metadata[:get_ncbi_taxonomy]
|
@@ -206,7 +208,7 @@ class MiGA::RemoteDataset < MiGA::MiGA
|
|
206
208
|
def ncbi_asm_json_doc
|
207
209
|
return @_ncbi_asm_json_doc unless @_ncbi_asm_json_doc.nil?
|
208
210
|
|
209
|
-
if db == :assembly && %i[ncbi gtdb].include?(universe)
|
211
|
+
if db == :assembly && %i[ncbi gtdb seqcode].include?(universe)
|
210
212
|
metadata[:ncbi_asm] ||= ids.first
|
211
213
|
end
|
212
214
|
return nil unless metadata[:ncbi_asm]
|
@@ -244,6 +246,7 @@ class MiGA::RemoteDataset < MiGA::MiGA
|
|
244
246
|
ln
|
245
247
|
end
|
246
248
|
|
249
|
+
alias :get_ncbi_taxid_from_seqcode :get_ncbi_taxid_from_ncbi
|
247
250
|
alias :get_ncbi_taxid_from_gtdb :get_ncbi_taxid_from_ncbi
|
248
251
|
|
249
252
|
def get_ncbi_taxid_from_ebi
|
data/lib/miga/version.rb
CHANGED
@@ -12,7 +12,7 @@ module MiGA
|
|
12
12
|
# - String indicating release status:
|
13
13
|
# - rc* release candidate, not released as gem
|
14
14
|
# - [0-9]+ stable release, released as gem
|
15
|
-
VERSION = [1.2,
|
15
|
+
VERSION = [1.2, 17, 0].freeze
|
16
16
|
|
17
17
|
##
|
18
18
|
# Nickname for the current major.minor version.
|
@@ -20,7 +20,7 @@ module MiGA
|
|
20
20
|
|
21
21
|
##
|
22
22
|
# Date of the current gem release.
|
23
|
-
VERSION_DATE = Date.new(2023, 2,
|
23
|
+
VERSION_DATE = Date.new(2023, 2, 14)
|
24
24
|
|
25
25
|
##
|
26
26
|
# References of MiGA
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: miga-base
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.
|
4
|
+
version: 1.2.17.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luis M. Rodriguez-R
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-02-
|
11
|
+
date: 2023-02-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: daemons
|
@@ -147,6 +147,7 @@ files:
|
|
147
147
|
- lib/miga/cli/action/download/base.rb
|
148
148
|
- lib/miga/cli/action/download/gtdb.rb
|
149
149
|
- lib/miga/cli/action/download/ncbi.rb
|
150
|
+
- lib/miga/cli/action/download/seqcode.rb
|
150
151
|
- lib/miga/cli/action/edit.rb
|
151
152
|
- lib/miga/cli/action/env.rb
|
152
153
|
- lib/miga/cli/action/files.rb
|
@@ -170,6 +171,7 @@ files:
|
|
170
171
|
- lib/miga/cli/action/quality_wf.rb
|
171
172
|
- lib/miga/cli/action/rm.rb
|
172
173
|
- lib/miga/cli/action/run.rb
|
174
|
+
- lib/miga/cli/action/seqcode_get.rb
|
173
175
|
- lib/miga/cli/action/stats.rb
|
174
176
|
- lib/miga/cli/action/summary.rb
|
175
177
|
- lib/miga/cli/action/tax_dist.rb
|