miga-base 0.3.7.1 → 0.3.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +4 -3
- data/actions/init.rb +3 -3
- data/actions/ncbi_get.rb +82 -87
- data/bin/miga +2 -1
- data/lib/miga/daemon.rb +7 -7
- data/lib/miga/dataset/result.rb +1 -1
- data/lib/miga/remote_dataset/base.rb +24 -10
- data/lib/miga/remote_dataset/download.rb +43 -18
- data/lib/miga/remote_dataset.rb +46 -23
- data/lib/miga/result/dates.rb +3 -3
- data/lib/miga/version.rb +2 -2
- data/test/daemon_test.rb +2 -2
- data/utils/distance/database.rb +1 -1
- data/utils/subclades.R +21 -11
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c7f7bb9ba42ccdbed81ce05484031e17a43c6e688e89c5622327aadbee9d0f31
|
4
|
+
data.tar.gz: 3ca5e3189bb65b213fe43a948710d69ea304dac4c838c176838773fc87a88c5b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9fa0deb9770be85a71145fcedd3a8a590ad7a6584f274245ed47df7decde6ddcb8d48c635585773a7005ceca4b03d37e67620346faff3eea387803d113a726f8
|
7
|
+
data.tar.gz: 17eeaad6dc985ff89d813ece94d3e2837fac4a40223cde14063b3236ba5dbd7ff581c6e7df30a4919cb002dd6918a42ac820cf8a3045f99714486f05bbc252fe
|
data/README.md
CHANGED
@@ -11,9 +11,10 @@
|
|
11
11
|
|
12
12
|
For additional information on MiGA, visit:
|
13
13
|
|
14
|
+
* [MiGA Online][miga-online]: The Microbial Genomes Atlas Online.
|
14
15
|
* [MiGA users list][mailing-list]:
|
15
16
|
Forum to discuss with other users and developers.
|
16
|
-
* [MiGA manual][
|
17
|
+
* [MiGA manual][manual]: The definitive guide to MiGA.
|
17
18
|
* [MiGA API docs][rubydoc]: Inner-workings of the `miga-base` gem.
|
18
19
|
* [MiGA Web][miga-web]: MiGA on Rails!
|
19
20
|
|
@@ -46,8 +47,8 @@ Technology and [RDP][rdp] at Michigan State University.
|
|
46
47
|
See [LICENSE](LICENSE).
|
47
48
|
|
48
49
|
[lrr]: http://lmrodriguezr.github.io/
|
49
|
-
[mailing-list]:
|
50
|
-
[
|
50
|
+
[mailing-list]: http://support.microbial-genomes.org/
|
51
|
+
[manual]: https://manual.microbial-genomes.org/
|
51
52
|
[rubydoc]: http://www.rubydoc.info/github/bio-miga/miga
|
52
53
|
[contact]: http://enve-omics.gatech.edu/node/7
|
53
54
|
[miga-web]: https://github.com/bio-miga/miga-web
|
data/actions/init.rb
CHANGED
@@ -229,7 +229,7 @@ unless File.exist?(daemon_f) and ask_user(
|
|
229
229
|
v[:latency] = ask_user(
|
230
230
|
'How long should I sleep? (in seconds)', '150').to_i
|
231
231
|
v[:maxjobs] = ask_user('How many jobs can I launch at once?', '300').to_i
|
232
|
-
v[:ppn] = ask_user('How many CPUs can I use per job?', '
|
232
|
+
v[:ppn] = ask_user('How many CPUs can I use per job?', '2').to_i
|
233
233
|
$stderr.puts 'Setting up internal daemon defaults.'
|
234
234
|
$stderr.puts 'If you don\'t understand this just leave default values:'
|
235
235
|
v[:cmd] = ask_user(
|
@@ -245,7 +245,7 @@ unless File.exist?(daemon_f) and ask_user(
|
|
245
245
|
v[:alive] = ask_user(
|
246
246
|
"How can I know that a process is still alive?\n %1$s: job id, " +
|
247
247
|
"output should be 1 for running and 0 for non-running.\n",
|
248
|
-
"squeue -h -o
|
248
|
+
"squeue -h -o %%t -j '%1$s' | grep '^PD\\|R\\|CF\\|CG$' " +
|
249
249
|
"| tail -n 1 | wc -l")
|
250
250
|
v[:kill] = ask_user(
|
251
251
|
"How should I terminate tasks?\n %s: process ID.", "scancel '%s'")
|
@@ -254,7 +254,7 @@ unless File.exist?(daemon_f) and ask_user(
|
|
254
254
|
v[:latency] = ask_user(
|
255
255
|
'How long should I sleep? (in seconds)', '150').to_i
|
256
256
|
v[:maxjobs] = ask_user('How many jobs can I launch at once?', '300').to_i
|
257
|
-
v[:ppn] = ask_user('How many CPUs can I use per job?', '
|
257
|
+
v[:ppn] = ask_user('How many CPUs can I use per job?', '2').to_i
|
258
258
|
$stderr.puts 'Setting up internal daemon defaults.'
|
259
259
|
$stderr.puts 'If you don\'t understand this just leave default values:'
|
260
260
|
v[:cmd] = ask_user(
|
data/actions/ncbi_get.rb
CHANGED
@@ -4,26 +4,25 @@
|
|
4
4
|
# @license Artistic-2.0
|
5
5
|
|
6
6
|
require 'miga/remote_dataset'
|
7
|
+
require 'csv'
|
7
8
|
|
8
9
|
o = {q:true, query:false, unlink:false,
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
10
|
+
reference: false, legacy_name: false,
|
11
|
+
complete: false, chromosome: false,
|
12
|
+
scaffold: false, contig: false, add_version: true, dry: false,
|
13
|
+
get_md: false}
|
13
14
|
OptionParser.new do |opt|
|
14
15
|
opt_banner(opt)
|
15
16
|
opt_object(opt, o, [:project])
|
16
17
|
opt.on('-T', '--taxon STRING',
|
17
|
-
|
18
|
-
|
18
|
+
'(Mandatory unless --reference) Taxon name (e.g., a species binomial).'
|
19
|
+
){ |v| o[:taxon]=v }
|
19
20
|
opt.on('--reference',
|
20
|
-
|
21
|
-
|
22
|
-
'If passed, ignores plasmids (only for --reference).'
|
23
|
-
){ |v| o[:ignore_plasmids]=v }
|
21
|
+
'Download all reference genomes (ignores any other status).'
|
22
|
+
){ |v| o[:reference]=v }
|
24
23
|
opt.on('--complete', 'Download complete genomes.'){ |v| o[:complete]=v }
|
25
24
|
opt.on('--chromosome',
|
26
|
-
|
25
|
+
'Download complete chromosomes.'){ |v| o[:chromosome]=v }
|
27
26
|
opt.on('--scaffold', 'Download genomes in scaffolds.'){ |v| o[:scaffold]=v }
|
28
27
|
opt.on('--contig', 'Download genomes in contigs.'){ |v| o[:contig]=v }
|
29
28
|
opt.on('--all', 'Download all genomes (in any status).') do
|
@@ -33,23 +32,26 @@ OptionParser.new do |opt|
|
|
33
32
|
o[:contig] = true
|
34
33
|
end
|
35
34
|
opt.on('--no-version-name',
|
36
|
-
|
37
|
-
|
35
|
+
'Do not add sequence version to the dataset name.',
|
36
|
+
'Only affects --complete and --chromosome.'){ |v| o[:add_version]=v }
|
37
|
+
opt.on('--legacy-name',
|
38
|
+
'Use dataset names based on chromosome entries instead of assembly.'
|
39
|
+
){ |v| o[:legacy_name] = v }
|
38
40
|
opt.on('--blacklist PATH',
|
39
|
-
|
41
|
+
'A file with dataset names to blacklist.'){ |v| o[:blacklist] = v }
|
40
42
|
opt.on('--dry', 'Do not download or save the datasets.'){ |v| o[:dry] = v }
|
41
43
|
opt.on('--get-metadata',
|
42
|
-
|
43
|
-
|
44
|
+
'Only download and update metadata for existing datasets'
|
45
|
+
){ |v| o[:get_md] = v }
|
44
46
|
opt.on('-q', '--query',
|
45
|
-
|
46
|
-
|
47
|
+
'Register the datasets as queries, not reference datasets.'
|
48
|
+
){ |v| o[:query]=v }
|
47
49
|
opt.on('-u', '--unlink',
|
48
|
-
|
49
|
-
|
50
|
+
'Unlink all datasets in the project missing from the download list.'
|
51
|
+
){ |v| o[:unlink]=v }
|
50
52
|
opt.on('-R', '--remote-list PATH',
|
51
|
-
|
52
|
-
|
53
|
+
'Path to an output file with the list of all datasets listed remotely.'
|
54
|
+
){ |v| o[:remote_list]=v }
|
53
55
|
opt.on('--api-key STRING', 'NCBI API key.'){ |v| ENV['NCBI_API_KEY'] = v }
|
54
56
|
opt_common(opt, o)
|
55
57
|
end.parse!
|
@@ -68,85 +70,78 @@ d = []
|
|
68
70
|
ds = {}
|
69
71
|
downloaded = 0
|
70
72
|
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
end
|
83
|
-
|
84
|
-
# Download IDs with reference status
|
73
|
+
url_base = 'https://www.ncbi.nlm.nih.gov/genomes/solr2txt.cgi?'
|
74
|
+
url_param = {
|
75
|
+
q: '[display()].' +
|
76
|
+
'from(GenomeAssemblies).' +
|
77
|
+
'usingschema(/schema/GenomeAssemblies).' +
|
78
|
+
'matching(tab==["Prokaryotes"] and q=="' + o[:taxon].tr('"',"'") + '"',
|
79
|
+
fields: 'organism|organism,assembly|assembly,replicons|replicons,' +
|
80
|
+
'level|level,ftp_path_genbank|ftp_path_genbank,release_date|release_date,' +
|
81
|
+
'strain|strain',
|
82
|
+
nolimit: 'on',
|
83
|
+
}
|
85
84
|
if o[:reference]
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
next if ids.empty?
|
96
|
-
n = r[2].miga_name
|
97
|
-
ds[n] = {ids: ids, md: {type: :genome}, db: :nuccore, universe: :ncbi}
|
98
|
-
end
|
85
|
+
url_param[:q] += ' and refseq_category==["representative"]'
|
86
|
+
else
|
87
|
+
status = {
|
88
|
+
complete: 'Complete',
|
89
|
+
chromosome: ' Chromosome', # <- The leading space is *VERY* important!
|
90
|
+
scaffold: 'Scaffold',
|
91
|
+
contig: 'Contig'
|
92
|
+
}.map { |k, v| '"' + v + '"' if o[k] }.compact.join(',')
|
93
|
+
url_param[:q] += ' and level==[' + status + ']'
|
99
94
|
end
|
95
|
+
url_param[:q] += ')'
|
96
|
+
url = url_base + URI.encode_www_form(url_param)
|
97
|
+
$stderr.puts 'Downloading genome list' unless o[:q]
|
98
|
+
lineno = 0
|
99
|
+
doc = MiGA::RemoteDataset.download_url(url)
|
100
|
+
CSV.parse(doc, headers: true).each do |r|
|
101
|
+
asm = r['assembly']
|
102
|
+
next if asm.nil? or asm.empty? or asm == '-'
|
100
103
|
|
101
|
-
#
|
102
|
-
|
103
|
-
|
104
|
-
'50|40' : o[:complete] ? '50' : '40')
|
105
|
-
$stderr.puts 'Downloading complete/chromosome genomes' unless o[:q]
|
106
|
-
lineno = 0
|
107
|
-
get_list(o[:taxon], status).each_line do |ln|
|
108
|
-
next if (lineno+=1)==1
|
109
|
-
r = ln.chomp.split("\t")
|
110
|
-
next if r[10].nil? or r[10].empty?
|
111
|
-
ids = r[10].gsub(/[^:;]*:/,'').gsub(/\/[^\/;]*/,'').split(';')
|
112
|
-
ids.delete_if{ |i| i =~ /\A\-*\z/ }
|
113
|
-
next if ids.empty?
|
114
|
-
acc = o[:add_version] ? ids[0] : ids[0].gsub(/\.\d+\Z/,'')
|
115
|
-
n = "#{r[0]}_#{acc}".miga_name
|
116
|
-
ds[n] = {ids: ids, md: {type: :genome}, db: :nuccore, universe: :ncbi}
|
117
|
-
end
|
118
|
-
end
|
104
|
+
# Get replicons
|
105
|
+
rep = r['replicons'].nil? ? nil : r['replicons'].
|
106
|
+
split('; ').map{ |i| i.gsub(/.*:/,'') }.map{ |i| i.gsub(/\/.*/, '') }
|
119
107
|
|
120
|
-
#
|
121
|
-
if o[:
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
ids = r[19].gsub(/\s/,'').split(';').delete_if{ |i| i =~ /\A\-*\z/ }.
|
132
|
-
map{ |i| "#{i}/#{File.basename(i)}_genomic.fna.gz" }
|
133
|
-
next if ids.empty?
|
134
|
-
n = "#{r[0]}_#{asm}".miga_name
|
135
|
-
asm.gsub!(/\(.*\)/, '')
|
136
|
-
ds[n] = {ids: ids, md: {type: :genome, ncbi_asm: asm},
|
137
|
-
db: :assembly_gz, universe: :web}
|
108
|
+
# Set name
|
109
|
+
if o[:legacy_name] and o[:reference]
|
110
|
+
n = r['#organism'].miga_name
|
111
|
+
else
|
112
|
+
if o[:legacy_name] and ['Complete',' Chromosome'].include? r['level']
|
113
|
+
acc = rep.nil? ? '' : rep.first
|
114
|
+
else
|
115
|
+
acc = asm
|
116
|
+
end
|
117
|
+
acc.gsub!(/\.\d+\Z/, '') unless o[:add_version]
|
118
|
+
n = "#{r['#organism']}_#{acc}".miga_name
|
138
119
|
end
|
120
|
+
|
121
|
+
# Register for download
|
122
|
+
fna_url = r['ftp_path_genbank'] + '/' +
|
123
|
+
File.basename(r['ftp_path_genbank']) + '_genomic.fna.gz'
|
124
|
+
ds[n] = {
|
125
|
+
ids: [fna_url], db: :assembly_gz, universe: :web,
|
126
|
+
md: {
|
127
|
+
type: :genome, ncbi_asm: asm, strain: r['strain']
|
128
|
+
}
|
129
|
+
}
|
130
|
+
ds[n][:md][:ncbi_nuccore] = rep.join(',') unless rep.nil?
|
131
|
+
ds[n][:md][:release_date] =
|
132
|
+
Time.parse(r['release_date']).to_s unless r['release_date'].nil?
|
139
133
|
end
|
140
134
|
|
141
135
|
# Discard blacklisted
|
142
136
|
unless o[:blacklist].nil?
|
143
137
|
$stderr.puts "Discarding datasets in #{o[:blacklist]}." unless o[:q]
|
144
|
-
File.readlines(o[:blacklist]).
|
138
|
+
File.readlines(o[:blacklist]).
|
139
|
+
select{ |i| i !~ /^#/ }.map(&:chomp).each{ |i| ds.delete i }
|
145
140
|
end
|
146
141
|
|
147
142
|
# Download entries
|
148
143
|
$stderr.puts "Downloading #{ds.size} " +
|
149
|
-
(ds.size
|
144
|
+
(ds.size == 1 ? "entry" : "entries") unless o[:q]
|
150
145
|
ds.each do |name,body|
|
151
146
|
d << name
|
152
147
|
puts name
|
data/bin/miga
CHANGED
@@ -126,7 +126,8 @@ end
|
|
126
126
|
def opt_common(opt, o)
|
127
127
|
opt.on("-v", "--verbose",
|
128
128
|
"Print additional information to STDERR."){ o[:q]=false }
|
129
|
-
opt.on("-d", "--debug INT",
|
129
|
+
opt.on("-d", "--debug INT",
|
130
|
+
"Print debugging information to STDERR (1: debug, 2: trace).") do |v|
|
130
131
|
v.to_i>1 ? MiGA::MiGA.DEBUG_TRACE_ON : MiGA::MiGA.DEBUG_ON
|
131
132
|
end
|
132
133
|
opt.on("-h", "--help", "Display this screen.") do
|
data/lib/miga/daemon.rb
CHANGED
@@ -12,11 +12,11 @@ class MiGA::Daemon < MiGA::MiGA
|
|
12
12
|
|
13
13
|
##
|
14
14
|
# When was the last time a daemon for the MiGA::Project +project+ was seen
|
15
|
-
# active? Returns
|
15
|
+
# active? Returns Time.
|
16
16
|
def self.last_alive(project)
|
17
17
|
f = File.expand_path('daemon/alive', project.path)
|
18
18
|
return nil unless File.exist? f
|
19
|
-
|
19
|
+
Time.parse(File.read(f))
|
20
20
|
end
|
21
21
|
|
22
22
|
# Array of all spawned daemons.
|
@@ -49,7 +49,7 @@ class MiGA::Daemon < MiGA::MiGA
|
|
49
49
|
|
50
50
|
##
|
51
51
|
# When was the last time a daemon for the current project was seen active?
|
52
|
-
# Returns
|
52
|
+
# Returns Time.
|
53
53
|
def last_alive
|
54
54
|
MiGA::Daemon.last_alive project
|
55
55
|
end
|
@@ -229,6 +229,10 @@ class MiGA::Daemon < MiGA::MiGA
|
|
229
229
|
@loop_i += 1
|
230
230
|
check_datasets
|
231
231
|
check_project
|
232
|
+
if shutdown_when_done? and jobs_running.size + jobs_to_run.size == 0
|
233
|
+
say 'Nothing else to do, shutting down.'
|
234
|
+
return false
|
235
|
+
end
|
232
236
|
flush!
|
233
237
|
if loop_i==4
|
234
238
|
say 'Housekeeping for sanity'
|
@@ -237,10 +241,6 @@ class MiGA::Daemon < MiGA::MiGA
|
|
237
241
|
end
|
238
242
|
report_status
|
239
243
|
sleep(latency)
|
240
|
-
if shutdown_when_done? and jobs_running.size+jobs_to_run.size == 0
|
241
|
-
say 'Nothing else to do, shutting down.'
|
242
|
-
return false
|
243
|
-
end
|
244
244
|
true
|
245
245
|
end
|
246
246
|
|
data/lib/miga/dataset/result.rb
CHANGED
@@ -215,7 +215,7 @@ module MiGA::Dataset::Result
|
|
215
215
|
r.clean! if opts[:is_clean]
|
216
216
|
unless r.clean?
|
217
217
|
MiGA::MiGA.clean_fasta_file(r.file_path :proteins)
|
218
|
-
MiGA::MiGA.clean_fasta_file(r.file_path :genes)
|
218
|
+
MiGA::MiGA.clean_fasta_file(r.file_path :genes) if r.file_path :genes
|
219
219
|
r.clean!
|
220
220
|
end
|
221
221
|
r
|
@@ -14,13 +14,15 @@ end
|
|
14
14
|
module MiGA::RemoteDataset::Base
|
15
15
|
|
16
16
|
@@_EUTILS = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
|
17
|
+
@@_NCBI_API_KEY = lambda { |url|
|
18
|
+
ENV['NCBI_API_KEY'].nil? ? url : "#{url}&api_key=#{ENV['NCBI_API_KEY']}" }
|
17
19
|
|
18
20
|
##
|
19
21
|
# Structure of the different database Universes or containers. The structure
|
20
22
|
# is a Hash with universe names as keys as Symbol and values being a Hash with
|
21
23
|
# supported keys as Symbol:
|
22
24
|
# - +:dbs+ => Hash with keys being the database name and the values a Hash of
|
23
|
-
# properties such as +stage+, +format+, and +
|
25
|
+
# properties such as +stage+, +format+, +map_to+, and +getter+.
|
24
26
|
# - +url+ => Pattern of the URL where the data can be obtained, where +%1$s+
|
25
27
|
# is the name of the database, +%2$s+ is the IDs, and +%3$s+ is format.
|
26
28
|
# Additional parameters can be passed to certain functions using the +extra+
|
@@ -37,21 +39,23 @@ module MiGA::RemoteDataset::Base
|
|
37
39
|
assembly_gz: {stage: :assembly, format: :fasta_gz},
|
38
40
|
text: {stage: :metadata, format: :text}
|
39
41
|
},
|
40
|
-
url:
|
42
|
+
url: '%2$s',
|
41
43
|
method: :net
|
42
44
|
},
|
43
45
|
ebi: {
|
44
46
|
dbs: { embl: {stage: :assembly, format: :fasta} },
|
45
|
-
url:
|
47
|
+
url: 'https://www.ebi.ac.uk/Tools/dbfetch/dbfetch/%1$s/%2$s/%3$s',
|
46
48
|
method: :rest
|
47
49
|
},
|
48
50
|
ncbi: {
|
49
|
-
dbs: {
|
51
|
+
dbs: {
|
52
|
+
nuccore: { stage: :assembly, format: :fasta },
|
53
|
+
assembly: { stage: :assembly, format: :fasta_gz, getter: :ncbi_asm },
|
54
|
+
taxonomy: { stage: :metadata, format: :xml }
|
55
|
+
},
|
50
56
|
url: "#{@@_EUTILS}efetch.fcgi?db=%1$s&id=%2$s&rettype=%3$s&retmode=text",
|
51
57
|
method: :rest,
|
52
|
-
api_key:
|
53
|
-
ENV['NCBI_API_KEY'].nil? ?
|
54
|
-
url : "#{url}&api_key=#{ENV['NCBI_API_KEY']}" }
|
58
|
+
api_key: @@_NCBI_API_KEY
|
55
59
|
},
|
56
60
|
ncbi_map: {
|
57
61
|
dbs: {
|
@@ -62,9 +66,19 @@ module MiGA::RemoteDataset::Base
|
|
62
66
|
url: "#{@@_EUTILS}elink.fcgi?dbfrom=%1$s&id=%2$s&db=%4$s&retmode=%3$s",
|
63
67
|
method: :net,
|
64
68
|
map_to_universe: :ncbi,
|
65
|
-
api_key:
|
66
|
-
|
67
|
-
|
69
|
+
api_key: @@_NCBI_API_KEY
|
70
|
+
},
|
71
|
+
ncbi_summary: {
|
72
|
+
dbs: { assembly: { stage: :metadata, format: :json } },
|
73
|
+
url: "#{@@_EUTILS}esummary.fcgi?db=%1$s&id=%2$s&retmode=%3$s",
|
74
|
+
method: :rest,
|
75
|
+
api_key: @@_NCBI_API_KEY
|
76
|
+
},
|
77
|
+
ncbi_search: {
|
78
|
+
dbs: { assembly: { stage: :metadata, format: :json } },
|
79
|
+
url: "#{@@_EUTILS}esearch.fcgi?db=%1$s&term=%2$s&retmode=%3$s",
|
80
|
+
method: :rest,
|
81
|
+
api_key: @@_NCBI_API_KEY
|
68
82
|
}
|
69
83
|
}
|
70
84
|
|
@@ -10,15 +10,22 @@ class MiGA::RemoteDataset
|
|
10
10
|
# Download data from the +universe+ in the database +db+ with IDs +ids+ and
|
11
11
|
# in +format+. If passed, it saves the result in +file+. Additional
|
12
12
|
# parameters specific to the download method can be passed using +extra+.
|
13
|
-
# Returns String.
|
14
|
-
|
13
|
+
# Returns String. The +obj+ can also be passed as MiGA::RemoteDataset or
|
14
|
+
# MiGA::Dataset.
|
15
|
+
def download(universe, db, ids, format, file = nil, extra = [], obj = nil)
|
15
16
|
ids = [ids] unless ids.is_a? Array
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
17
|
+
getter = @@UNIVERSE[universe][:dbs][db][:getter] || :download
|
18
|
+
method = @@UNIVERSE[universe][:method]
|
19
|
+
opts = {
|
20
|
+
universe: universe,
|
21
|
+
db: db,
|
22
|
+
ids: ids,
|
23
|
+
format: format,
|
24
|
+
file: file,
|
25
|
+
extra: extra,
|
26
|
+
obj: obj
|
27
|
+
}
|
28
|
+
doc = send("#{getter}_#{method}", opts)
|
22
29
|
unless file.nil?
|
23
30
|
ofh = File.open(file, 'w')
|
24
31
|
ofh.print doc
|
@@ -28,20 +35,37 @@ class MiGA::RemoteDataset
|
|
28
35
|
end
|
29
36
|
|
30
37
|
##
|
31
|
-
# Download data
|
32
|
-
#
|
33
|
-
#
|
34
|
-
|
35
|
-
|
36
|
-
|
38
|
+
# Download data from NCBI Assembly database using the REST method.
|
39
|
+
# Supported +opts+ (Hash) include:
|
40
|
+
# +obj+ (mandatory): MiGA::RemoteDataset
|
41
|
+
# +ids+ (mandatory): String or Array of String
|
42
|
+
# +file+: String, passed to download
|
43
|
+
# +extra+: Array, passed to download
|
44
|
+
# +format+: String, passed to download
|
45
|
+
def ncbi_asm_rest(opts)
|
46
|
+
url_dir = opts[:obj].ncbi_asm_json_doc['ftppath_genbank']
|
47
|
+
url = "#{url_dir}/#{File.basename url_dir}_genomic.fna.gz"
|
48
|
+
download(:web, :assembly_gz, url,
|
49
|
+
opts[:format], opts[:file], opts[:extra], opts[:obj])
|
50
|
+
end
|
51
|
+
|
52
|
+
##
|
53
|
+
# Download data using the REST method. Supported +opts+ (Hash) include:
|
54
|
+
# +universe+ (mandatory): Symbol
|
55
|
+
# +db+ (mandatory): Symbol
|
56
|
+
# +ids+ (mandatory): Array of String
|
57
|
+
# +format+: String
|
58
|
+
# +extra+: Array
|
59
|
+
def download_rest(opts)
|
60
|
+
u = @@UNIVERSE[opts[:universe]]
|
61
|
+
url = sprintf(u[:url],
|
62
|
+
opts[:db], opts[:ids].join(','), opts[:format], *opts[:extra])
|
37
63
|
url = u[:api_key][url] unless u[:api_key].nil?
|
38
64
|
download_url url
|
39
65
|
end
|
40
66
|
|
41
67
|
##
|
42
|
-
#
|
43
|
-
# with IDs +ids+ and in +format+. Additional URL parameters can be passed
|
44
|
-
# using +extra+. Returns the doc as String.
|
68
|
+
# Alias of download_rest
|
45
69
|
alias download_net download_rest
|
46
70
|
|
47
71
|
##
|
@@ -51,6 +75,7 @@ class MiGA::RemoteDataset
|
|
51
75
|
doc = ''
|
52
76
|
@timeout_try = 0
|
53
77
|
begin
|
78
|
+
DEBUG 'GET: ' + url
|
54
79
|
open(url, read_timeout: 600) { |f| doc = f.read }
|
55
80
|
rescue => e
|
56
81
|
@timeout_try += 1
|
@@ -82,6 +107,6 @@ module MiGA::RemoteDataset::Download
|
|
82
107
|
# Download data into +file+.
|
83
108
|
def download(file)
|
84
109
|
self.class.download(universe, db, ids,
|
85
|
-
self.class.UNIVERSE[universe][:dbs][db][:format], file)
|
110
|
+
self.class.UNIVERSE[universe][:dbs][db][:format], file, [], self)
|
86
111
|
end
|
87
112
|
end
|
data/lib/miga/remote_dataset.rb
CHANGED
@@ -8,6 +8,16 @@ require 'miga/remote_dataset/download'
|
|
8
8
|
class MiGA::RemoteDataset < MiGA::MiGA
|
9
9
|
include MiGA::RemoteDataset::Download
|
10
10
|
|
11
|
+
# Class-level
|
12
|
+
|
13
|
+
class << self
|
14
|
+
def ncbi_asm_acc2id(acc)
|
15
|
+
return acc if acc =~ /^\d+$/
|
16
|
+
search_doc = JSON.parse download(:ncbi_search, :assembly, acc, :json)
|
17
|
+
search_doc['esearchresult']['idlist'].first
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
11
21
|
# Instance-level
|
12
22
|
|
13
23
|
##
|
@@ -19,6 +29,8 @@ class MiGA::RemoteDataset < MiGA::MiGA
|
|
19
29
|
attr_reader :ids
|
20
30
|
# Internal metadata hash
|
21
31
|
attr_reader :metadata
|
32
|
+
# NCBI Assembly XML document
|
33
|
+
@_ncbi_asm_xml_doc = nil
|
22
34
|
|
23
35
|
##
|
24
36
|
# Initialize MiGA::RemoteDataset with +ids+ in database +db+ from +universe+.
|
@@ -33,6 +45,7 @@ class MiGA::RemoteDataset < MiGA::MiGA
|
|
33
45
|
raise "Unknown Universe: #{@universe}. Try: #{@@UNIVERSE.keys}"
|
34
46
|
@@UNIVERSE[@universe][:dbs].include?(@db) or
|
35
47
|
raise "Unknown Database: #{@db}. Try: #{@@UNIVERSE[@universe][:dbs]}"
|
48
|
+
@_ncbi_asm_json_doc = nil
|
36
49
|
# FIXME: Part of the +map_to+ support:
|
37
50
|
# unless @@UNIVERSE[@universe][:dbs][@db][:map_to].nil?
|
38
51
|
# MiGA::RemoteDataset.download
|
@@ -87,7 +100,8 @@ class MiGA::RemoteDataset < MiGA::MiGA
|
|
87
100
|
##
|
88
101
|
# Get NCBI Taxonomy ID.
|
89
102
|
def get_ncbi_taxid
|
90
|
-
|
103
|
+
origin = (universe == :ncbi and db == :assembly) ? :web : universe
|
104
|
+
send("get_ncbi_taxid_from_#{origin}")
|
91
105
|
end
|
92
106
|
|
93
107
|
##
|
@@ -107,6 +121,7 @@ class MiGA::RemoteDataset < MiGA::MiGA
|
|
107
121
|
# Get NCBI taxonomy as MiGA::Taxonomy.
|
108
122
|
def get_ncbi_taxonomy
|
109
123
|
tax_id = get_ncbi_taxid
|
124
|
+
return nil if tax_id.nil?
|
110
125
|
lineage = {}
|
111
126
|
doc = MiGA::RemoteDataset.download(:ncbi, :taxonomy, tax_id, :xml)
|
112
127
|
doc.scan(%r{<Taxon>(.*?)</Taxon>}m).map(&:first).each do |i|
|
@@ -119,15 +134,24 @@ class MiGA::RemoteDataset < MiGA::MiGA
|
|
119
134
|
MiGA::Taxonomy.new(lineage)
|
120
135
|
end
|
121
136
|
|
137
|
+
##
|
138
|
+
# Get the JSON document describing an NCBI assembly entry.
|
139
|
+
def ncbi_asm_json_doc
|
140
|
+
return @_ncbi_asm_json_doc unless @_ncbi_asm_json_doc.nil?
|
141
|
+
metadata[:ncbi_asm] ||= ids.first if universe == :ncbi and db == :assembly
|
142
|
+
return nil unless metadata[:ncbi_asm]
|
143
|
+
ncbi_asm_id = self.class.ncbi_asm_acc2id metadata[:ncbi_asm]
|
144
|
+
doc = JSON.parse(
|
145
|
+
self.class.download(:ncbi_summary, :assembly, ncbi_asm_id, :json))
|
146
|
+
@_ncbi_asm_json_doc = doc['result'][ doc['result']['uids'].first ]
|
147
|
+
end
|
148
|
+
|
149
|
+
|
122
150
|
private
|
123
151
|
|
124
152
|
def get_ncbi_taxid_from_web
|
125
|
-
return nil
|
126
|
-
|
127
|
-
doc = self.class.download_url(
|
128
|
-
"#{base_url}/#{metadata[:ncbi_asm]}?report=xml&format=text")
|
129
|
-
taxid = doc.scan(%r{<Taxid>(\S+)</Taxid>}).first
|
130
|
-
taxid.nil? ? taxid : taxid.first
|
153
|
+
return nil if ncbi_asm_json_doc.nil?
|
154
|
+
ncbi_asm_json_doc['taxid']
|
131
155
|
end
|
132
156
|
|
133
157
|
def get_ncbi_taxid_from_ncbi
|
@@ -154,29 +178,28 @@ class MiGA::RemoteDataset < MiGA::MiGA
|
|
154
178
|
biosample = self.class.ncbi_map(metadata[:ncbi_nuccore],
|
155
179
|
:nuccore, :biosample)
|
156
180
|
return metadata if biosample.nil?
|
157
|
-
asm = self.class.ncbi_map(biosample,
|
158
|
-
:biosample, :assembly)
|
181
|
+
asm = self.class.ncbi_map(biosample, :biosample, :assembly)
|
159
182
|
metadata[:ncbi_asm] = asm.to_s unless asm.nil?
|
160
183
|
get_type_status_ncbi_asm metadata
|
161
184
|
end
|
162
185
|
|
163
186
|
def get_type_status_ncbi_asm(metadata)
|
164
|
-
return metadata if
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
187
|
+
return metadata if ncbi_asm_json_doc.nil?
|
188
|
+
from_type = ncbi_asm_json_doc['from_type']
|
189
|
+
from_type = ncbi_asm_json_doc['fromtype'] if from_type.nil?
|
190
|
+
case from_type
|
191
|
+
when nil
|
192
|
+
# Do nothing
|
193
|
+
when ''
|
170
194
|
metadata[:is_type] = false
|
171
195
|
metadata[:is_ref_type] = false
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
metadata[:type_rel] = $1
|
196
|
+
when 'assembly from reference material'
|
197
|
+
metadata[:is_type] = false
|
198
|
+
metadata[:is_ref_type] = true
|
199
|
+
metadata[:type_rel] = from_type
|
200
|
+
else
|
201
|
+
metadata[:is_type] = true
|
202
|
+
metadata[:type_rel] = from_type
|
180
203
|
end
|
181
204
|
metadata
|
182
205
|
end
|
data/lib/miga/result/dates.rb
CHANGED
@@ -7,14 +7,14 @@ module MiGA::Result::Dates
|
|
7
7
|
include MiGA::Result::Base
|
8
8
|
|
9
9
|
##
|
10
|
-
# Returns the start date of processing as
|
10
|
+
# Returns the start date of processing as Time or +nil+ if it doesn't
|
11
11
|
# exist.
|
12
12
|
def started_at
|
13
13
|
date_at :start
|
14
14
|
end
|
15
15
|
|
16
16
|
##
|
17
|
-
# Returns the end (done) date of processing as
|
17
|
+
# Returns the end (done) date of processing as Time or +nil+ if it doesn't
|
18
18
|
# exist.
|
19
19
|
def done_at
|
20
20
|
date_at :done
|
@@ -38,7 +38,7 @@ module MiGA::Result::Dates
|
|
38
38
|
f = path event
|
39
39
|
date = File.read(f) if File.size? f
|
40
40
|
end
|
41
|
-
date.nil? ? nil :
|
41
|
+
date.nil? ? nil : Time.parse(date)
|
42
42
|
end
|
43
43
|
end
|
44
44
|
|
data/lib/miga/version.rb
CHANGED
@@ -10,7 +10,7 @@ module MiGA
|
|
10
10
|
# - Float representing the major.minor version.
|
11
11
|
# - Integer representing gem releases of the current version.
|
12
12
|
# - Integer representing minor changes that require new version number.
|
13
|
-
VERSION = [0.3,
|
13
|
+
VERSION = [0.3, 8, 0]
|
14
14
|
|
15
15
|
##
|
16
16
|
# Nickname for the current major.minor version.
|
@@ -18,7 +18,7 @@ module MiGA
|
|
18
18
|
|
19
19
|
##
|
20
20
|
# Date of the current gem release.
|
21
|
-
VERSION_DATE = Date.
|
21
|
+
VERSION_DATE = Date.new(2019, 02, 28)
|
22
22
|
|
23
23
|
##
|
24
24
|
# Reference of MiGA.
|
data/test/daemon_test.rb
CHANGED
@@ -55,7 +55,7 @@ class DaemonTest < Test::Unit::TestCase
|
|
55
55
|
out = capture_stdout do
|
56
56
|
d.in_loop
|
57
57
|
end
|
58
|
-
assert_equal(
|
58
|
+
assert_equal(Time, d.last_alive.class)
|
59
59
|
assert(out.string =~ /-{20}\n.*MiGA:#{p.name} launched/)
|
60
60
|
2.times{ d.in_loop }
|
61
61
|
assert_equal(3, d.loop_i)
|
@@ -96,7 +96,7 @@ class DaemonTest < Test::Unit::TestCase
|
|
96
96
|
d = MiGA::Daemon.new(p)
|
97
97
|
assert_nil(d.last_alive)
|
98
98
|
d.declare_alive
|
99
|
-
assert(d.last_alive -
|
99
|
+
assert(d.last_alive - Time.now < 1)
|
100
100
|
end
|
101
101
|
|
102
102
|
def test_options
|
data/utils/distance/database.rb
CHANGED
@@ -68,7 +68,7 @@ module MiGA::DistanceRunner::Database
|
|
68
68
|
if dataset.is_ref? and project.path == ref_project.path
|
69
69
|
y = data_from_db(
|
70
70
|
target.name, dataset.name, ref_db(metric, target.name), metric)
|
71
|
-
unless y.nil? or y.first.zero?
|
71
|
+
unless y.nil? or y.first.nil? or y.first.zero?
|
72
72
|
# Store a copy
|
73
73
|
data_to_db(dataset.name, target.name, tmp_dbs[metric], metric, y)
|
74
74
|
return y.first
|
data/utils/subclades.R
CHANGED
@@ -48,12 +48,18 @@ subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
|
|
48
48
|
ani.types <- a[,2]
|
49
49
|
names(ani.types) <- a[,1]
|
50
50
|
if(length(ani.d) == 0) load(dist_rdata)
|
51
|
-
}else{
|
51
|
+
}else if(length(labels(ani.d)) > 8L){
|
52
52
|
res <- subclade_clustering(out_base, thr, ani.d, dist_rdata)
|
53
53
|
if(length(res) == 0) return(NULL)
|
54
54
|
ani.medoids <- res[['ani.medoids']]
|
55
55
|
ani.types <- res[['ani.types']]
|
56
56
|
ani.d <- res[['ani.d']]
|
57
|
+
}else{
|
58
|
+
ani.medoids <- labels(ani.d)[which.min(colSums(as.matrix(ani.d)))]
|
59
|
+
ani.types <- rep(1, length(labels(ani.d)))
|
60
|
+
names(ani.types) <- labels(ani.d)
|
61
|
+
generate_empty_files(out_base)
|
62
|
+
write_text_report(out_base, ani.d, ani.medoids, ani.types)
|
57
63
|
}
|
58
64
|
|
59
65
|
# Recursive search
|
@@ -136,16 +142,7 @@ subclade_clustering <- function(out_base, thr, ani.d, dist_rdata) {
|
|
136
142
|
dev.off()
|
137
143
|
|
138
144
|
# Save results
|
139
|
-
|
140
|
-
write.table(ani.medoids, paste(out_base, "medoids", sep="."),
|
141
|
-
quote=FALSE, col.names=FALSE, row.names=FALSE)
|
142
|
-
classif <- cbind(names(ani.types), ani.types, ani.medoids[ ani.types ], NA)
|
143
|
-
ani.d.m <- 100 - as.matrix(ani.d)*100
|
144
|
-
for(j in 1:nrow(classif)){
|
145
|
-
classif[j,4] <- ani.d.m[classif[j,1], classif[j,3]]
|
146
|
-
}
|
147
|
-
write.table(classif, paste(out_base,"classif",sep="."),
|
148
|
-
quote=FALSE, col.names=FALSE, row.names=FALSE, sep="\t")
|
145
|
+
write_text_report(out_base, ani.d, ani.medoids, ani.types)
|
149
146
|
|
150
147
|
# Return data
|
151
148
|
say("Cluster ready")
|
@@ -168,6 +165,19 @@ generate_empty_files <- function(out_base) {
|
|
168
165
|
file.create(paste(out_base,".1.medoids",sep=""))
|
169
166
|
}
|
170
167
|
|
168
|
+
write_text_report <- function(out_base, ani.d, ani.medoids, ani.types){
|
169
|
+
say("Text report")
|
170
|
+
write.table(ani.medoids, paste(out_base, "medoids", sep="."),
|
171
|
+
quote=FALSE, col.names=FALSE, row.names=FALSE)
|
172
|
+
classif <- cbind(names(ani.types), ani.types, ani.medoids[ ani.types ], NA)
|
173
|
+
ani.d.m <- 100 - as.matrix(ani.d)*100
|
174
|
+
for(j in 1:nrow(classif)){
|
175
|
+
classif[j,4] <- ani.d.m[classif[j,1], classif[j,3]]
|
176
|
+
}
|
177
|
+
write.table(classif, paste(out_base,"classif",sep="."),
|
178
|
+
quote=FALSE, col.names=FALSE, row.names=FALSE, sep="\t")
|
179
|
+
}
|
180
|
+
|
171
181
|
plot_silhouette <- function(k, s, ns, ds, top.n) {
|
172
182
|
# s
|
173
183
|
par(mar=c(4,5,1,5)+0.1)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: miga-base
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.
|
4
|
+
version: 0.3.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luis M. Rodriguez-R
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-02-
|
11
|
+
date: 2019-02-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: daemons
|