miga-base 0.3.7.1 → 0.3.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +4 -3
- data/actions/init.rb +3 -3
- data/actions/ncbi_get.rb +82 -87
- data/bin/miga +2 -1
- data/lib/miga/daemon.rb +7 -7
- data/lib/miga/dataset/result.rb +1 -1
- data/lib/miga/remote_dataset/base.rb +24 -10
- data/lib/miga/remote_dataset/download.rb +43 -18
- data/lib/miga/remote_dataset.rb +46 -23
- data/lib/miga/result/dates.rb +3 -3
- data/lib/miga/version.rb +2 -2
- data/test/daemon_test.rb +2 -2
- data/utils/distance/database.rb +1 -1
- data/utils/subclades.R +21 -11
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c7f7bb9ba42ccdbed81ce05484031e17a43c6e688e89c5622327aadbee9d0f31
+  data.tar.gz: 3ca5e3189bb65b213fe43a948710d69ea304dac4c838c176838773fc87a88c5b
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 9fa0deb9770be85a71145fcedd3a8a590ad7a6584f274245ed47df7decde6ddcb8d48c635585773a7005ceca4b03d37e67620346faff3eea387803d113a726f8
+  data.tar.gz: 17eeaad6dc985ff89d813ece94d3e2837fac4a40223cde14063b3236ba5dbd7ff581c6e7df30a4919cb002dd6918a42ac820cf8a3045f99714486f05bbc252fe
data/README.md
CHANGED
@@ -11,9 +11,10 @@
 
 For additional information on MiGA, visit:
 
+* [MiGA Online][miga-online]: The Microbial Genomes Atlas Online.
 * [MiGA users list][mailing-list]:
   Forum to discuss with other users and developers.
-* [MiGA manual][
+* [MiGA manual][manual]: The definitive guide to MiGA.
 * [MiGA API docs][rubydoc]: Inner-workings of the `miga-base` gem.
 * [MiGA Web][miga-web]: MiGA on Rails!
 
@@ -46,8 +47,8 @@ Technology and [RDP][rdp] at Michigan State University.
 See [LICENSE](LICENSE).
 
 [lrr]: http://lmrodriguezr.github.io/
-[mailing-list]:
-[
+[mailing-list]: http://support.microbial-genomes.org/
+[manual]: https://manual.microbial-genomes.org/
 [rubydoc]: http://www.rubydoc.info/github/bio-miga/miga
 [contact]: http://enve-omics.gatech.edu/node/7
 [miga-web]: https://github.com/bio-miga/miga-web
data/actions/init.rb
CHANGED
@@ -229,7 +229,7 @@ unless File.exist?(daemon_f) and ask_user(
   v[:latency] = ask_user(
     'How long should I sleep? (in seconds)', '150').to_i
   v[:maxjobs] = ask_user('How many jobs can I launch at once?', '300').to_i
-  v[:ppn] = ask_user('How many CPUs can I use per job?', '
+  v[:ppn] = ask_user('How many CPUs can I use per job?', '2').to_i
   $stderr.puts 'Setting up internal daemon defaults.'
   $stderr.puts 'If you don\'t understand this just leave default values:'
   v[:cmd] = ask_user(
@@ -245,7 +245,7 @@ unless File.exist?(daemon_f) and ask_user(
   v[:alive] = ask_user(
     "How can I know that a process is still alive?\n %1$s: job id, " +
     "output should be 1 for running and 0 for non-running.\n",
-    "squeue -h -o
+    "squeue -h -o %%t -j '%1$s' | grep '^PD\\|R\\|CF\\|CG$' " +
     "| tail -n 1 | wc -l")
   v[:kill] = ask_user(
     "How should I terminate tasks?\n %s: process ID.", "scancel '%s'")
@@ -254,7 +254,7 @@ unless File.exist?(daemon_f) and ask_user(
   v[:latency] = ask_user(
     'How long should I sleep? (in seconds)', '150').to_i
   v[:maxjobs] = ask_user('How many jobs can I launch at once?', '300').to_i
-  v[:ppn] = ask_user('How many CPUs can I use per job?', '
+  v[:ppn] = ask_user('How many CPUs can I use per job?', '2').to_i
   $stderr.puts 'Setting up internal daemon defaults.'
   $stderr.puts 'If you don\'t understand this just leave default values:'
   v[:cmd] = ask_user(
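For reference, a hedged sketch of how the daemon expands the 'alive' template configured above: Ruby's format operator substitutes '%1$s' with the job ID and turns '%%' into a literal '%'. The job ID below is hypothetical and the grep filter from the default above is omitted for brevity.

# Sketch only: expanding a simplified alive-check template (no grep filter).
alive_tmpl = "squeue -h -o %%t -j '%1$s' | tail -n 1 | wc -l"
puts(alive_tmpl % ['12345']) # hypothetical job ID
# => squeue -h -o %t -j '12345' | tail -n 1 | wc -l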
data/actions/ncbi_get.rb
CHANGED
@@ -4,26 +4,25 @@
 # @license Artistic-2.0
 
 require 'miga/remote_dataset'
+require 'csv'
 
 o = {q:true, query:false, unlink:false,
-
-
-
-
+  reference: false, legacy_name: false,
+  complete: false, chromosome: false,
+  scaffold: false, contig: false, add_version: true, dry: false,
+  get_md: false}
 OptionParser.new do |opt|
   opt_banner(opt)
   opt_object(opt, o, [:project])
   opt.on('-T', '--taxon STRING',
-
-
+    '(Mandatory unless --reference) Taxon name (e.g., a species binomial).'
+    ){ |v| o[:taxon]=v }
   opt.on('--reference',
-
-
-    'If passed, ignores plasmids (only for --reference).'
-    ){ |v| o[:ignore_plasmids]=v }
+    'Download all reference genomes (ignores any other status).'
+    ){ |v| o[:reference]=v }
   opt.on('--complete', 'Download complete genomes.'){ |v| o[:complete]=v }
   opt.on('--chromosome',
-
+    'Download complete chromosomes.'){ |v| o[:chromosome]=v }
   opt.on('--scaffold', 'Download genomes in scaffolds.'){ |v| o[:scaffold]=v }
   opt.on('--contig', 'Download genomes in contigs.'){ |v| o[:contig]=v }
   opt.on('--all', 'Download all genomes (in any status).') do
@@ -33,23 +32,26 @@ OptionParser.new do |opt|
     o[:contig] = true
   end
   opt.on('--no-version-name',
-
-
+    'Do not add sequence version to the dataset name.',
+    'Only affects --complete and --chromosome.'){ |v| o[:add_version]=v }
+  opt.on('--legacy-name',
+    'Use dataset names based on chromosome entries instead of assembly.'
+    ){ |v| o[:legacy_name] = v }
   opt.on('--blacklist PATH',
-
+    'A file with dataset names to blacklist.'){ |v| o[:blacklist] = v }
   opt.on('--dry', 'Do not download or save the datasets.'){ |v| o[:dry] = v }
   opt.on('--get-metadata',
-
-
+    'Only download and update metadata for existing datasets'
+    ){ |v| o[:get_md] = v }
   opt.on('-q', '--query',
-
-
+    'Register the datasets as queries, not reference datasets.'
+    ){ |v| o[:query]=v }
   opt.on('-u', '--unlink',
-
-
+    'Unlink all datasets in the project missing from the download list.'
+    ){ |v| o[:unlink]=v }
   opt.on('-R', '--remote-list PATH',
-
-
+    'Path to an output file with the list of all datasets listed remotely.'
+    ){ |v| o[:remote_list]=v }
   opt.on('--api-key STRING', 'NCBI API key.'){ |v| ENV['NCBI_API_KEY'] = v }
   opt_common(opt, o)
 end.parse!
@@ -68,85 +70,78 @@ d = []
 ds = {}
 downloaded = 0
 
-
-
-
-
-
-
-
-
-
-
-
-end
-
-# Download IDs with reference status
+url_base = 'https://www.ncbi.nlm.nih.gov/genomes/solr2txt.cgi?'
+url_param = {
+  q: '[display()].' +
+    'from(GenomeAssemblies).' +
+    'usingschema(/schema/GenomeAssemblies).' +
+    'matching(tab==["Prokaryotes"] and q=="' + o[:taxon].tr('"',"'") + '"',
+  fields: 'organism|organism,assembly|assembly,replicons|replicons,' +
+    'level|level,ftp_path_genbank|ftp_path_genbank,release_date|release_date,' +
+    'strain|strain',
+  nolimit: 'on',
+}
 if o[:reference]
-
-
-
-
-
-
-
-
-
-  next if ids.empty?
-  n = r[2].miga_name
-  ds[n] = {ids: ids, md: {type: :genome}, db: :nuccore, universe: :ncbi}
-end
+  url_param[:q] += ' and refseq_category==["representative"]'
+else
+  status = {
+    complete: 'Complete',
+    chromosome: ' Chromosome', # <- The leading space is *VERY* important!
+    scaffold: 'Scaffold',
+    contig: 'Contig'
+  }.map { |k, v| '"' + v + '"' if o[k] }.compact.join(',')
+  url_param[:q] += ' and level==[' + status + ']'
 end
+url_param[:q] += ')'
+url = url_base + URI.encode_www_form(url_param)
+$stderr.puts 'Downloading genome list' unless o[:q]
+lineno = 0
+doc = MiGA::RemoteDataset.download_url(url)
+CSV.parse(doc, headers: true).each do |r|
+  asm = r['assembly']
+  next if asm.nil? or asm.empty? or asm == '-'
 
-#
-
-
-    '50|40' : o[:complete] ? '50' : '40')
-  $stderr.puts 'Downloading complete/chromosome genomes' unless o[:q]
-  lineno = 0
-  get_list(o[:taxon], status).each_line do |ln|
-    next if (lineno+=1)==1
-    r = ln.chomp.split("\t")
-    next if r[10].nil? or r[10].empty?
-    ids = r[10].gsub(/[^:;]*:/,'').gsub(/\/[^\/;]*/,'').split(';')
-    ids.delete_if{ |i| i =~ /\A\-*\z/ }
-    next if ids.empty?
-    acc = o[:add_version] ? ids[0] : ids[0].gsub(/\.\d+\Z/,'')
-    n = "#{r[0]}_#{acc}".miga_name
-    ds[n] = {ids: ids, md: {type: :genome}, db: :nuccore, universe: :ncbi}
-  end
-end
+  # Get replicons
+  rep = r['replicons'].nil? ? nil : r['replicons'].
+    split('; ').map{ |i| i.gsub(/.*:/,'') }.map{ |i| i.gsub(/\/.*/, '') }
 
-#
-if o[:
-
-
-
-
-
-
-
-
-  ids = r[19].gsub(/\s/,'').split(';').delete_if{ |i| i =~ /\A\-*\z/ }.
-    map{ |i| "#{i}/#{File.basename(i)}_genomic.fna.gz" }
-  next if ids.empty?
-  n = "#{r[0]}_#{asm}".miga_name
-  asm.gsub!(/\(.*\)/, '')
-  ds[n] = {ids: ids, md: {type: :genome, ncbi_asm: asm},
-    db: :assembly_gz, universe: :web}
+  # Set name
+  if o[:legacy_name] and o[:reference]
+    n = r['#organism'].miga_name
+  else
+    if o[:legacy_name] and ['Complete',' Chromosome'].include? r['level']
+      acc = rep.nil? ? '' : rep.first
+    else
+      acc = asm
+    end
+    acc.gsub!(/\.\d+\Z/, '') unless o[:add_version]
+    n = "#{r['#organism']}_#{acc}".miga_name
   end
+
+  # Register for download
+  fna_url = r['ftp_path_genbank'] + '/' +
+    File.basename(r['ftp_path_genbank']) + '_genomic.fna.gz'
+  ds[n] = {
+    ids: [fna_url], db: :assembly_gz, universe: :web,
+    md: {
+      type: :genome, ncbi_asm: asm, strain: r['strain']
+    }
+  }
+  ds[n][:md][:ncbi_nuccore] = rep.join(',') unless rep.nil?
+  ds[n][:md][:release_date] =
+    Time.parse(r['release_date']).to_s unless r['release_date'].nil?
 end
 
 # Discard blacklisted
 unless o[:blacklist].nil?
   $stderr.puts "Discarding datasets in #{o[:blacklist]}." unless o[:q]
-  File.readlines(o[:blacklist]).
+  File.readlines(o[:blacklist]).
+    select{ |i| i !~ /^#/ }.map(&:chomp).each{ |i| ds.delete i }
 end
 
 # Download entries
 $stderr.puts "Downloading #{ds.size} " +
-  (ds.size
+  (ds.size == 1 ? "entry" : "entries") unless o[:q]
 ds.each do |name,body|
   d << name
   puts name
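To make the new retrieval path above easier to follow, here is a hedged, self-contained Ruby sketch of how the genome-list URL is assembled for a taxon and how the CSV result would be consumed. The taxon value is only an example, and the commented loop mirrors (in simplified form) the action's own parsing.

require 'csv'
require 'uri'

taxon = 'Escherichia coli' # hypothetical example taxon
url_base  = 'https://www.ncbi.nlm.nih.gov/genomes/solr2txt.cgi?'
url_param = {
  q: '[display()].from(GenomeAssemblies).usingschema(/schema/GenomeAssemblies).' +
     'matching(tab==["Prokaryotes"] and q=="' + taxon.tr('"', "'") + '")',
  fields: 'organism|organism,assembly|assembly,replicons|replicons,level|level,' +
          'ftp_path_genbank|ftp_path_genbank,release_date|release_date,strain|strain',
  nolimit: 'on'
}
url = url_base + URI.encode_www_form(url_param)
puts url

# In the action the document is then fetched and parsed, approximately:
# doc = MiGA::RemoteDataset.download_url(url)
# CSV.parse(doc, headers: true).each do |r|
#   next if r['assembly'].to_s.empty? or r['assembly'] == '-'
#   fna_url = r['ftp_path_genbank'] + '/' +
#     File.basename(r['ftp_path_genbank']) + '_genomic.fna.gz'
#   puts "#{r['#organism']}\t#{fna_url}"
# end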
data/bin/miga
CHANGED
@@ -126,7 +126,8 @@ end
 def opt_common(opt, o)
   opt.on("-v", "--verbose",
     "Print additional information to STDERR."){ o[:q]=false }
-  opt.on("-d", "--debug INT",
+  opt.on("-d", "--debug INT",
+    "Print debugging information to STDERR (1: debug, 2: trace).") do |v|
     v.to_i>1 ? MiGA::MiGA.DEBUG_TRACE_ON : MiGA::MiGA.DEBUG_ON
   end
   opt.on("-h", "--help", "Display this screen.") do
data/lib/miga/daemon.rb
CHANGED
@@ -12,11 +12,11 @@ class MiGA::Daemon < MiGA::MiGA
 
   ##
   # When was the last time a daemon for the MiGA::Project +project+ was seen
-  # active? Returns
+  # active? Returns Time.
   def self.last_alive(project)
     f = File.expand_path('daemon/alive', project.path)
     return nil unless File.exist? f
-
+    Time.parse(File.read(f))
   end
 
   # Array of all spawned daemons.
@@ -49,7 +49,7 @@ class MiGA::Daemon < MiGA::MiGA
 
   ##
   # When was the last time a daemon for the current project was seen active?
-  # Returns
+  # Returns Time.
   def last_alive
     MiGA::Daemon.last_alive project
   end
@@ -229,6 +229,10 @@ class MiGA::Daemon < MiGA::MiGA
     @loop_i += 1
     check_datasets
     check_project
+    if shutdown_when_done? and jobs_running.size + jobs_to_run.size == 0
+      say 'Nothing else to do, shutting down.'
+      return false
+    end
     flush!
     if loop_i==4
       say 'Housekeeping for sanity'
@@ -237,10 +241,6 @@ class MiGA::Daemon < MiGA::MiGA
     end
     report_status
     sleep(latency)
-    if shutdown_when_done? and jobs_running.size+jobs_to_run.size == 0
-      say 'Nothing else to do, shutting down.'
-      return false
-    end
     true
   end
 
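The hunks above move the shutdown-when-done test from after sleep(latency) to just after check_project, so an idle daemon exits before flushing, housekeeping, and sleeping; as in the diff, returning false from in_loop signals shutdown. A toy, self-contained sketch of the reordered loop (method names taken from the diff; the stubs are illustrative, not MiGA's implementation):

class LoopSketch
  def initialize(shutdown_when_done, running, to_run)
    @shutdown_when_done = shutdown_when_done
    @jobs_running = Array.new(running)
    @jobs_to_run = Array.new(to_run)
  end

  def shutdown_when_done?; @shutdown_when_done; end
  def jobs_running; @jobs_running; end
  def jobs_to_run; @jobs_to_run; end

  # Returns false to stop the daemon, true to keep looping.
  def in_loop
    # ... check_datasets and check_project would run here ...
    if shutdown_when_done? and jobs_running.size + jobs_to_run.size == 0
      puts 'Nothing else to do, shutting down.'
      return false # exit before flushing, housekeeping, and sleeping
    end
    # ... flush!, housekeeping, report_status, sleep(latency) ...
    true
  end
end

puts LoopSketch.new(true, 0, 0).in_loop # => false: idle daemon stops at once
puts LoopSketch.new(true, 2, 1).in_loop # => true: work pending, keep looping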
data/lib/miga/dataset/result.rb
CHANGED
@@ -215,7 +215,7 @@ module MiGA::Dataset::Result
     r.clean! if opts[:is_clean]
     unless r.clean?
       MiGA::MiGA.clean_fasta_file(r.file_path :proteins)
-      MiGA::MiGA.clean_fasta_file(r.file_path :genes)
+      MiGA::MiGA.clean_fasta_file(r.file_path :genes) if r.file_path :genes
       r.clean!
     end
     r
data/lib/miga/remote_dataset/base.rb
CHANGED
@@ -14,13 +14,15 @@ end
 module MiGA::RemoteDataset::Base
 
   @@_EUTILS = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
+  @@_NCBI_API_KEY = lambda { |url|
+    ENV['NCBI_API_KEY'].nil? ? url : "#{url}&api_key=#{ENV['NCBI_API_KEY']}" }
 
   ##
   # Structure of the different database Universes or containers. The structure
   # is a Hash with universe names as keys as Symbol and values being a Hash with
   # supported keys as Symbol:
   # - +:dbs+ => Hash with keys being the database name and the values a Hash of
-  #   properties such as +stage+, +format+, and +
+  #   properties such as +stage+, +format+, +map_to+, and +getter+.
   # - +url+ => Pattern of the URL where the data can be obtained, where +%1$s+
   #   is the name of the database, +%2$s+ is the IDs, and +%3$s+ is format.
   #   Additional parameters can be passed to certain functions using the +extra+
@@ -37,21 +39,23 @@ module MiGA::RemoteDataset::Base
       assembly_gz: {stage: :assembly, format: :fasta_gz},
       text: {stage: :metadata, format: :text}
     },
-    url:
+    url: '%2$s',
     method: :net
   },
   ebi: {
     dbs: { embl: {stage: :assembly, format: :fasta} },
-    url:
+    url: 'https://www.ebi.ac.uk/Tools/dbfetch/dbfetch/%1$s/%2$s/%3$s',
     method: :rest
   },
   ncbi: {
-    dbs: {
+    dbs: {
+      nuccore: { stage: :assembly, format: :fasta },
+      assembly: { stage: :assembly, format: :fasta_gz, getter: :ncbi_asm },
+      taxonomy: { stage: :metadata, format: :xml }
+    },
     url: "#{@@_EUTILS}efetch.fcgi?db=%1$s&id=%2$s&rettype=%3$s&retmode=text",
     method: :rest,
-    api_key:
-      ENV['NCBI_API_KEY'].nil? ?
-        url : "#{url}&api_key=#{ENV['NCBI_API_KEY']}" }
+    api_key: @@_NCBI_API_KEY
   },
   ncbi_map: {
     dbs: {
@@ -62,9 +66,19 @@ module MiGA::RemoteDataset::Base
     url: "#{@@_EUTILS}elink.fcgi?dbfrom=%1$s&id=%2$s&db=%4$s&retmode=%3$s",
     method: :net,
     map_to_universe: :ncbi,
-    api_key:
-
-
+    api_key: @@_NCBI_API_KEY
+  },
+  ncbi_summary: {
+    dbs: { assembly: { stage: :metadata, format: :json } },
+    url: "#{@@_EUTILS}esummary.fcgi?db=%1$s&id=%2$s&retmode=%3$s",
+    method: :rest,
+    api_key: @@_NCBI_API_KEY
+  },
+  ncbi_search: {
+    dbs: { assembly: { stage: :metadata, format: :json } },
+    url: "#{@@_EUTILS}esearch.fcgi?db=%1$s&term=%2$s&retmode=%3$s",
+    method: :rest,
+    api_key: @@_NCBI_API_KEY
   }
 }
 
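A small standalone sketch of the @@_NCBI_API_KEY lambda introduced above: it takes an already-built E-utilities URL and appends api_key only when the NCBI_API_KEY environment variable is set. The URL and key below are illustrative.

ncbi_api_key = lambda { |url|
  ENV['NCBI_API_KEY'].nil? ? url : "#{url}&api_key=#{ENV['NCBI_API_KEY']}"
}

url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi' +
      '?db=nuccore&id=NC_000913.3&rettype=fasta&retmode=text'
puts ncbi_api_key[url]         # unchanged when no key is set
ENV['NCBI_API_KEY'] = 'my-key' # hypothetical key
puts ncbi_api_key[url]         # ...&api_key=my-key appended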
data/lib/miga/remote_dataset/download.rb
CHANGED
@@ -10,15 +10,22 @@ class MiGA::RemoteDataset
   # Download data from the +universe+ in the database +db+ with IDs +ids+ and
   # in +format+. If passed, it saves the result in +file+. Additional
   # parameters specific to the download method can be passed using +extra+.
-  # Returns String.
-
+  # Returns String. The +obj+ can also be passed as MiGA::RemoteDataset or
+  # MiGA::Dataset.
+  def download(universe, db, ids, format, file = nil, extra = [], obj = nil)
     ids = [ids] unless ids.is_a? Array
-
-
-
-
-
-
+    getter = @@UNIVERSE[universe][:dbs][db][:getter] || :download
+    method = @@UNIVERSE[universe][:method]
+    opts = {
+      universe: universe,
+      db: db,
+      ids: ids,
+      format: format,
+      file: file,
+      extra: extra,
+      obj: obj
+    }
+    doc = send("#{getter}_#{method}", opts)
     unless file.nil?
       ofh = File.open(file, 'w')
       ofh.print doc
@@ -28,20 +35,37 @@ class MiGA::RemoteDataset
   end
 
   ##
-  # Download data
-  #
-  #
-
-
-
+  # Download data from NCBI Assembly database using the REST method.
+  # Supported +opts+ (Hash) include:
+  # +obj+ (mandatory): MiGA::RemoteDataset
+  # +ids+ (mandatory): String or Array of String
+  # +file+: String, passed to download
+  # +extra+: Array, passed to download
+  # +format+: String, passed to download
+  def ncbi_asm_rest(opts)
+    url_dir = opts[:obj].ncbi_asm_json_doc['ftppath_genbank']
+    url = "#{url_dir}/#{File.basename url_dir}_genomic.fna.gz"
+    download(:web, :assembly_gz, url,
+      opts[:format], opts[:file], opts[:extra], opts[:obj])
+  end
+
+  ##
+  # Download data using the REST method. Supported +opts+ (Hash) include:
+  # +universe+ (mandatory): Symbol
+  # +db+ (mandatory): Symbol
+  # +ids+ (mandatory): Array of String
+  # +format+: String
+  # +extra+: Array
+  def download_rest(opts)
+    u = @@UNIVERSE[opts[:universe]]
+    url = sprintf(u[:url],
+      opts[:db], opts[:ids].join(','), opts[:format], *opts[:extra])
     url = u[:api_key][url] unless u[:api_key].nil?
     download_url url
   end
 
   ##
-  #
-  # with IDs +ids+ and in +format+. Additional URL parameters can be passed
-  # using +extra+. Returns the doc as String.
+  # Alias of download_rest
   alias download_net download_rest
 
   ##
@@ -51,6 +75,7 @@ class MiGA::RemoteDataset
     doc = ''
     @timeout_try = 0
     begin
+      DEBUG 'GET: ' + url
      open(url, read_timeout: 600) { |f| doc = f.read }
    rescue => e
      @timeout_try += 1
@@ -82,6 +107,6 @@ module MiGA::RemoteDataset::Download
   # Download data into +file+.
   def download(file)
     self.class.download(universe, db, ids,
-      self.class.UNIVERSE[universe][:dbs][db][:format], file)
+      self.class.UNIVERSE[universe][:dbs][db][:format], file, [], self)
   end
 end
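To illustrate the new dispatch in download(): the handler name is built from the database's optional :getter (falling back to :download) plus the universe's :method, so :ncbi/:assembly routes to ncbi_asm_rest while most databases still go through download_rest. A minimal, self-contained sketch with stubbed handlers and a trimmed universe table (not MiGA's full @@UNIVERSE):

UNIVERSE_SKETCH = {
  ncbi: { method: :rest, dbs: { assembly: { getter: :ncbi_asm } } },
  ebi:  { method: :rest, dbs: { embl: {} } }
}

def download_rest(opts); "download_rest for #{opts[:db]}"; end
def ncbi_asm_rest(opts); "ncbi_asm_rest for #{opts[:ids].join(',')}"; end

def dispatch(universe, db, ids)
  getter = UNIVERSE_SKETCH[universe][:dbs][db][:getter] || :download
  method = UNIVERSE_SKETCH[universe][:method]
  send("#{getter}_#{method}", { db: db, ids: ids })
end

puts dispatch(:ebi, :embl, ['U00096'])             # => download_rest for embl
puts dispatch(:ncbi, :assembly, ['GCA_000005845']) # => ncbi_asm_rest for GCA_000005845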
data/lib/miga/remote_dataset.rb
CHANGED
@@ -8,6 +8,16 @@ require 'miga/remote_dataset/download'
 class MiGA::RemoteDataset < MiGA::MiGA
   include MiGA::RemoteDataset::Download
 
+  # Class-level
+
+  class << self
+    def ncbi_asm_acc2id(acc)
+      return acc if acc =~ /^\d+$/
+      search_doc = JSON.parse download(:ncbi_search, :assembly, acc, :json)
+      search_doc['esearchresult']['idlist'].first
+    end
+  end
+
   # Instance-level
 
   ##
@@ -19,6 +29,8 @@ class MiGA::RemoteDataset < MiGA::MiGA
   attr_reader :ids
   # Internal metadata hash
   attr_reader :metadata
+  # NCBI Assembly XML document
+  @_ncbi_asm_xml_doc = nil
 
   ##
   # Initialize MiGA::RemoteDataset with +ids+ in database +db+ from +universe+.
@@ -33,6 +45,7 @@ class MiGA::RemoteDataset < MiGA::MiGA
       raise "Unknown Universe: #{@universe}. Try: #{@@UNIVERSE.keys}"
     @@UNIVERSE[@universe][:dbs].include?(@db) or
       raise "Unknown Database: #{@db}. Try: #{@@UNIVERSE[@universe][:dbs]}"
+    @_ncbi_asm_json_doc = nil
     # FIXME: Part of the +map_to+ support:
     # unless @@UNIVERSE[@universe][:dbs][@db][:map_to].nil?
     #   MiGA::RemoteDataset.download
@@ -87,7 +100,8 @@ class MiGA::RemoteDataset < MiGA::MiGA
   ##
   # Get NCBI Taxonomy ID.
   def get_ncbi_taxid
-
+    origin = (universe == :ncbi and db == :assembly) ? :web : universe
+    send("get_ncbi_taxid_from_#{origin}")
   end
 
   ##
@@ -107,6 +121,7 @@ class MiGA::RemoteDataset < MiGA::MiGA
   # Get NCBI taxonomy as MiGA::Taxonomy.
   def get_ncbi_taxonomy
     tax_id = get_ncbi_taxid
+    return nil if tax_id.nil?
     lineage = {}
     doc = MiGA::RemoteDataset.download(:ncbi, :taxonomy, tax_id, :xml)
     doc.scan(%r{<Taxon>(.*?)</Taxon>}m).map(&:first).each do |i|
@@ -119,15 +134,24 @@ class MiGA::RemoteDataset < MiGA::MiGA
     MiGA::Taxonomy.new(lineage)
   end
 
+  ##
+  # Get the JSON document describing an NCBI assembly entry.
+  def ncbi_asm_json_doc
+    return @_ncbi_asm_json_doc unless @_ncbi_asm_json_doc.nil?
+    metadata[:ncbi_asm] ||= ids.first if universe == :ncbi and db == :assembly
+    return nil unless metadata[:ncbi_asm]
+    ncbi_asm_id = self.class.ncbi_asm_acc2id metadata[:ncbi_asm]
+    doc = JSON.parse(
+      self.class.download(:ncbi_summary, :assembly, ncbi_asm_id, :json))
+    @_ncbi_asm_json_doc = doc['result'][ doc['result']['uids'].first ]
+  end
+
+
   private
 
   def get_ncbi_taxid_from_web
-    return nil
-
-    doc = self.class.download_url(
-      "#{base_url}/#{metadata[:ncbi_asm]}?report=xml&format=text")
-    taxid = doc.scan(%r{<Taxid>(\S+)</Taxid>}).first
-    taxid.nil? ? taxid : taxid.first
+    return nil if ncbi_asm_json_doc.nil?
+    ncbi_asm_json_doc['taxid']
   end
 
   def get_ncbi_taxid_from_ncbi
@@ -154,29 +178,28 @@ class MiGA::RemoteDataset < MiGA::MiGA
     biosample = self.class.ncbi_map(metadata[:ncbi_nuccore],
       :nuccore, :biosample)
     return metadata if biosample.nil?
-    asm = self.class.ncbi_map(biosample,
-      :biosample, :assembly)
+    asm = self.class.ncbi_map(biosample, :biosample, :assembly)
     metadata[:ncbi_asm] = asm.to_s unless asm.nil?
     get_type_status_ncbi_asm metadata
   end
 
   def get_type_status_ncbi_asm(metadata)
-    return metadata if
-
-
-
-
-
+    return metadata if ncbi_asm_json_doc.nil?
+    from_type = ncbi_asm_json_doc['from_type']
+    from_type = ncbi_asm_json_doc['fromtype'] if from_type.nil?
+    case from_type
+    when nil
+      # Do nothing
+    when ''
       metadata[:is_type] = false
      metadata[:is_ref_type] = false
-
-
-
-
-
-
-
-      metadata[:type_rel] = $1
+    when 'assembly from reference material'
+      metadata[:is_type] = false
+      metadata[:is_ref_type] = true
+      metadata[:type_rel] = from_type
+    else
+      metadata[:is_type] = true
+      metadata[:type_rel] = from_type
     end
     metadata
   end
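Putting the new pieces together, a hedged sketch of the accession-to-JSON flow added above: esearch maps an assembly accession to a numeric UID and esummary returns the JSON document whose ftppath_genbank and taxid fields the new code reads. This mirrors ncbi_asm_acc2id and ncbi_asm_json_doc using the class method shown in the diff; the accession is only an example and the calls reach NCBI when run.

require 'json'
require 'miga/remote_dataset'

acc = 'GCA_000005845.2' # hypothetical assembly accession
search = JSON.parse(
  MiGA::RemoteDataset.download(:ncbi_search, :assembly, acc, :json))
uid = search['esearchresult']['idlist'].first

summary = JSON.parse(
  MiGA::RemoteDataset.download(:ncbi_summary, :assembly, uid, :json))
doc = summary['result'][summary['result']['uids'].first]
puts doc['ftppath_genbank'] # used by ncbi_asm_rest to build the .fna.gz URL
puts doc['taxid']           # used by get_ncbi_taxid_from_web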
data/lib/miga/result/dates.rb
CHANGED
@@ -7,14 +7,14 @@ module MiGA::Result::Dates
   include MiGA::Result::Base
 
   ##
-  # Returns the start date of processing as
+  # Returns the start date of processing as Time or +nil+ if it doesn't
   # exist.
   def started_at
     date_at :start
   end
 
   ##
-  # Returns the end (done) date of processing as
+  # Returns the end (done) date of processing as Time or +nil+ if it doesn't
   # exist.
   def done_at
     date_at :done
@@ -38,7 +38,7 @@ module MiGA::Result::Dates
       f = path event
       date = File.read(f) if File.size? f
     end
-    date.nil? ? nil :
+    date.nil? ? nil : Time.parse(date)
   end
 end
 
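Since date_at now returns a parsed Time rather than the raw file contents, start and done stamps can be compared directly; a tiny sketch with hypothetical timestamps:

require 'time'
started = Time.parse('2019-02-28 09:15:02 -0500') # hypothetical timestamps
done    = Time.parse('2019-02-28 09:18:47 -0500')
puts done - started # => 225.0 (elapsed seconds as Float)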
data/lib/miga/version.rb
CHANGED
@@ -10,7 +10,7 @@ module MiGA
   # - Float representing the major.minor version.
   # - Integer representing gem releases of the current version.
   # - Integer representing minor changes that require new version number.
-  VERSION = [0.3,
+  VERSION = [0.3, 8, 0]
 
   ##
   # Nickname for the current major.minor version.
@@ -18,7 +18,7 @@ module MiGA
 
   ##
   # Date of the current gem release.
-  VERSION_DATE = Date.
+  VERSION_DATE = Date.new(2019, 02, 28)
 
   ##
   # Reference of MiGA.
data/test/daemon_test.rb
CHANGED
@@ -55,7 +55,7 @@ class DaemonTest < Test::Unit::TestCase
     out = capture_stdout do
       d.in_loop
     end
-    assert_equal(
+    assert_equal(Time, d.last_alive.class)
     assert(out.string =~ /-{20}\n.*MiGA:#{p.name} launched/)
     2.times{ d.in_loop }
     assert_equal(3, d.loop_i)
@@ -96,7 +96,7 @@ class DaemonTest < Test::Unit::TestCase
     d = MiGA::Daemon.new(p)
     assert_nil(d.last_alive)
     d.declare_alive
-    assert(d.last_alive -
+    assert(d.last_alive - Time.now < 1)
   end
 
   def test_options
data/utils/distance/database.rb
CHANGED
@@ -68,7 +68,7 @@ module MiGA::DistanceRunner::Database
     if dataset.is_ref? and project.path == ref_project.path
       y = data_from_db(
         target.name, dataset.name, ref_db(metric, target.name), metric)
-      unless y.nil? or y.first.zero?
+      unless y.nil? or y.first.nil? or y.first.zero?
         # Store a copy
         data_to_db(dataset.name, target.name, tmp_dbs[metric], metric, y)
         return y.first
data/utils/subclades.R
CHANGED
@@ -48,12 +48,18 @@ subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
     ani.types <- a[,2]
     names(ani.types) <- a[,1]
     if(length(ani.d) == 0) load(dist_rdata)
-  }else{
+  }else if(length(labels(ani.d)) > 8L){
     res <- subclade_clustering(out_base, thr, ani.d, dist_rdata)
     if(length(res) == 0) return(NULL)
     ani.medoids <- res[['ani.medoids']]
     ani.types <- res[['ani.types']]
     ani.d <- res[['ani.d']]
+  }else{
+    ani.medoids <- labels(ani.d)[which.min(colSums(as.matrix(ani.d)))]
+    ani.types <- rep(1, length(labels(ani.d)))
+    names(ani.types) <- labels(ani.d)
+    generate_empty_files(out_base)
+    write_text_report(out_base, ani.d, ani.medoids, ani.types)
   }
 
   # Recursive search
@@ -136,16 +142,7 @@ subclade_clustering <- function(out_base, thr, ani.d, dist_rdata) {
   dev.off()
 
   # Save results
-
-  write.table(ani.medoids, paste(out_base, "medoids", sep="."),
-    quote=FALSE, col.names=FALSE, row.names=FALSE)
-  classif <- cbind(names(ani.types), ani.types, ani.medoids[ ani.types ], NA)
-  ani.d.m <- 100 - as.matrix(ani.d)*100
-  for(j in 1:nrow(classif)){
-    classif[j,4] <- ani.d.m[classif[j,1], classif[j,3]]
-  }
-  write.table(classif, paste(out_base,"classif",sep="."),
-    quote=FALSE, col.names=FALSE, row.names=FALSE, sep="\t")
+  write_text_report(out_base, ani.d, ani.medoids, ani.types)
 
   # Return data
   say("Cluster ready")
@@ -168,6 +165,19 @@ generate_empty_files <- function(out_base) {
   file.create(paste(out_base,".1.medoids",sep=""))
 }
 
+write_text_report <- function(out_base, ani.d, ani.medoids, ani.types){
+  say("Text report")
+  write.table(ani.medoids, paste(out_base, "medoids", sep="."),
+    quote=FALSE, col.names=FALSE, row.names=FALSE)
+  classif <- cbind(names(ani.types), ani.types, ani.medoids[ ani.types ], NA)
+  ani.d.m <- 100 - as.matrix(ani.d)*100
+  for(j in 1:nrow(classif)){
+    classif[j,4] <- ani.d.m[classif[j,1], classif[j,3]]
+  }
+  write.table(classif, paste(out_base,"classif",sep="."),
+    quote=FALSE, col.names=FALSE, row.names=FALSE, sep="\t")
+}
+
 
 plot_silhouette <- function(k, s, ns, ds, top.n) {
   # s
   par(mar=c(4,5,1,5)+0.1)
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: miga-base
 version: !ruby/object:Gem::Version
-  version: 0.3.
+  version: 0.3.8.0
 platform: ruby
 authors:
 - Luis M. Rodriguez-R
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2019-02-
+date: 2019-02-28 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: daemons