miga-base 0.3.7.1 → 0.3.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: aeb46208fbfdb522754876da59589895784fb72de06afdded9000066f34c569d
4
- data.tar.gz: 61d2afe7e630ebc4635b38f3ee72ba89e7faebfaec502f130bb9bcadc970b951
3
+ metadata.gz: c7f7bb9ba42ccdbed81ce05484031e17a43c6e688e89c5622327aadbee9d0f31
4
+ data.tar.gz: 3ca5e3189bb65b213fe43a948710d69ea304dac4c838c176838773fc87a88c5b
5
5
  SHA512:
6
- metadata.gz: 7ce52a87fa8f5fa8609546b6d98d9f8e49c139ad8894f45954cede0b3ce2dea7a7035dbdf308a0fb0478ea630e51d86476cc67102182091151e21fc6e6c698f3
7
- data.tar.gz: e108be9b315fe8315278e2ff22fbf86e3896c01e132dedb0f0e2183e0415626beece1ea4d82318b5365b4b84a60294a9728171ef37efff2650b8f39c062f28cc
6
+ metadata.gz: 9fa0deb9770be85a71145fcedd3a8a590ad7a6584f274245ed47df7decde6ddcb8d48c635585773a7005ceca4b03d37e67620346faff3eea387803d113a726f8
7
+ data.tar.gz: 17eeaad6dc985ff89d813ece94d3e2837fac4a40223cde14063b3236ba5dbd7ff581c6e7df30a4919cb002dd6918a42ac820cf8a3045f99714486f05bbc252fe
data/README.md CHANGED
@@ -11,9 +11,10 @@
11
11
 
12
12
  For additional information on MiGA, visit:
13
13
 
14
+ * [MiGA Online][miga-online]: The Microbial Genomes Atlas Online.
14
15
  * [MiGA users list][mailing-list]:
15
16
  Forum to discuss with other users and developers.
16
- * [MiGA manual][gitbook]: The definitive guide to MiGA.
17
+ * [MiGA manual][manual]: The definitive guide to MiGA.
17
18
  * [MiGA API docs][rubydoc]: Inner-workings of the `miga-base` gem.
18
19
  * [MiGA Web][miga-web]: MiGA on Rails!
19
20
 
@@ -46,8 +47,8 @@ Technology and [RDP][rdp] at Michigan State University.
46
47
  See [LICENSE](LICENSE).
47
48
 
48
49
  [lrr]: http://lmrodriguezr.github.io/
49
- [mailing-list]: https://groups.google.com/forum/#!forum/miga-users
50
- [gitbook]: https://miga.gitbooks.io/miga/content/
50
+ [mailing-list]: http://support.microbial-genomes.org/
51
+ [manual]: https://manual.microbial-genomes.org/
51
52
  [rubydoc]: http://www.rubydoc.info/github/bio-miga/miga
52
53
  [contact]: http://enve-omics.gatech.edu/node/7
53
54
  [miga-web]: https://github.com/bio-miga/miga-web
data/actions/init.rb CHANGED
@@ -229,7 +229,7 @@ unless File.exist?(daemon_f) and ask_user(
229
229
  v[:latency] = ask_user(
230
230
  'How long should I sleep? (in seconds)', '150').to_i
231
231
  v[:maxjobs] = ask_user('How many jobs can I launch at once?', '300').to_i
232
- v[:ppn] = ask_user('How many CPUs can I use per job?', '4').to_i
232
+ v[:ppn] = ask_user('How many CPUs can I use per job?', '2').to_i
233
233
  $stderr.puts 'Setting up internal daemon defaults.'
234
234
  $stderr.puts 'If you don\'t understand this just leave default values:'
235
235
  v[:cmd] = ask_user(
@@ -245,7 +245,7 @@ unless File.exist?(daemon_f) and ask_user(
245
245
  v[:alive] = ask_user(
246
246
  "How can I know that a process is still alive?\n %1$s: job id, " +
247
247
  "output should be 1 for running and 0 for non-running.\n",
248
- "squeue -h -o %t -j '%1$s' | grep '^PD\|R\|CF\|CG$' " +
248
+ "squeue -h -o %%t -j '%1$s' | grep '^PD\\|R\\|CF\\|CG$' " +
249
249
  "| tail -n 1 | wc -l")
250
250
  v[:kill] = ask_user(
251
251
  "How should I terminate tasks?\n %s: process ID.", "scancel '%s'")
@@ -254,7 +254,7 @@ unless File.exist?(daemon_f) and ask_user(
254
254
  v[:latency] = ask_user(
255
255
  'How long should I sleep? (in seconds)', '150').to_i
256
256
  v[:maxjobs] = ask_user('How many jobs can I launch at once?', '300').to_i
257
- v[:ppn] = ask_user('How many CPUs can I use per job?', '4').to_i
257
+ v[:ppn] = ask_user('How many CPUs can I use per job?', '2').to_i
258
258
  $stderr.puts 'Setting up internal daemon defaults.'
259
259
  $stderr.puts 'If you don\'t understand this just leave default values:'
260
260
  v[:cmd] = ask_user(
data/actions/ncbi_get.rb CHANGED
@@ -4,26 +4,25 @@
4
4
  # @license Artistic-2.0
5
5
 
6
6
  require 'miga/remote_dataset'
7
+ require 'csv'
7
8
 
8
9
  o = {q:true, query:false, unlink:false,
9
- reference: false, ignore_plasmids: false,
10
- complete: false, chromosome: false,
11
- scaffold: false, contig: false, add_version: true, dry: false,
12
- get_md: false}
10
+ reference: false, legacy_name: false,
11
+ complete: false, chromosome: false,
12
+ scaffold: false, contig: false, add_version: true, dry: false,
13
+ get_md: false}
13
14
  OptionParser.new do |opt|
14
15
  opt_banner(opt)
15
16
  opt_object(opt, o, [:project])
16
17
  opt.on('-T', '--taxon STRING',
17
- '(Mandatory unless --reference) Taxon name (e.g., a species binomial).'
18
- ){ |v| o[:taxon]=v }
18
+ '(Mandatory unless --reference) Taxon name (e.g., a species binomial).'
19
+ ){ |v| o[:taxon]=v }
19
20
  opt.on('--reference',
20
- 'Download all reference genomes (ignores -T).'){ |v| o[:reference]=v }
21
- opt.on('--ref-no-plasmids',
22
- 'If passed, ignores plasmids (only for --reference).'
23
- ){ |v| o[:ignore_plasmids]=v }
21
+ 'Download all reference genomes (ignores any other status).'
22
+ ){ |v| o[:reference]=v }
24
23
  opt.on('--complete', 'Download complete genomes.'){ |v| o[:complete]=v }
25
24
  opt.on('--chromosome',
26
- 'Download complete chromosomes.'){ |v| o[:chromosome]=v }
25
+ 'Download complete chromosomes.'){ |v| o[:chromosome]=v }
27
26
  opt.on('--scaffold', 'Download genomes in scaffolds.'){ |v| o[:scaffold]=v }
28
27
  opt.on('--contig', 'Download genomes in contigs.'){ |v| o[:contig]=v }
29
28
  opt.on('--all', 'Download all genomes (in any status).') do
@@ -33,23 +32,26 @@ OptionParser.new do |opt|
33
32
  o[:contig] = true
34
33
  end
35
34
  opt.on('--no-version-name',
36
- 'Do not add sequence version to the dataset name.',
37
- 'Only affects --complete and --chromosome.'){ |v| o[:add_version]=v }
35
+ 'Do not add sequence version to the dataset name.',
36
+ 'Only affects --complete and --chromosome.'){ |v| o[:add_version]=v }
37
+ opt.on('--legacy-name',
38
+ 'Use dataset names based on chromosome entries instead of assembly.'
39
+ ){ |v| o[:legacy_name] = v }
38
40
  opt.on('--blacklist PATH',
39
- 'A file with dataset names to blacklist.'){ |v| o[:blacklist] = v }
41
+ 'A file with dataset names to blacklist.'){ |v| o[:blacklist] = v }
40
42
  opt.on('--dry', 'Do not download or save the datasets.'){ |v| o[:dry] = v }
41
43
  opt.on('--get-metadata',
42
- 'Only download and update metadata for existing datasets'
43
- ){ |v| o[:get_md] = v }
44
+ 'Only download and update metadata for existing datasets'
45
+ ){ |v| o[:get_md] = v }
44
46
  opt.on('-q', '--query',
45
- 'Register the datasets as queries, not reference datasets.'
46
- ){ |v| o[:query]=v }
47
+ 'Register the datasets as queries, not reference datasets.'
48
+ ){ |v| o[:query]=v }
47
49
  opt.on('-u', '--unlink',
48
- 'Unlink all datasets in the project missing from the download list.'
49
- ){ |v| o[:unlink]=v }
50
+ 'Unlink all datasets in the project missing from the download list.'
51
+ ){ |v| o[:unlink]=v }
50
52
  opt.on('-R', '--remote-list PATH',
51
- 'Path to an output file with the list of all datasets listed remotely.'
52
- ){ |v| o[:remote_list]=v }
53
+ 'Path to an output file with the list of all datasets listed remotely.'
54
+ ){ |v| o[:remote_list]=v }
53
55
  opt.on('--api-key STRING', 'NCBI API key.'){ |v| ENV['NCBI_API_KEY'] = v }
54
56
  opt_common(opt, o)
55
57
  end.parse!
@@ -68,85 +70,78 @@ d = []
68
70
  ds = {}
69
71
  downloaded = 0
70
72
 
71
- def get_list(taxon, status)
72
- url_base = 'https://www.ncbi.nlm.nih.gov/genomes/Genome2BE/genome2srv.cgi?'
73
- url_param = if status==:reference
74
- { action: 'refgenomes', download: 'on' }
75
- else
76
- { action: 'download', report: 'proks', group: '-- All Prokaryotes --',
77
- subgroup: '-- All Prokaryotes --', orgn: "#{taxon}[orgn]",
78
- status: status }
79
- end
80
- url = url_base + URI.encode_www_form(url_param)
81
- MiGA::RemoteDataset.download_url url
82
- end
83
-
84
- # Download IDs with reference status
73
+ url_base = 'https://www.ncbi.nlm.nih.gov/genomes/solr2txt.cgi?'
74
+ url_param = {
75
+ q: '[display()].' +
76
+ 'from(GenomeAssemblies).' +
77
+ 'usingschema(/schema/GenomeAssemblies).' +
78
+ 'matching(tab==["Prokaryotes"] and q=="' + o[:taxon].tr('"',"'") + '"',
79
+ fields: 'organism|organism,assembly|assembly,replicons|replicons,' +
80
+ 'level|level,ftp_path_genbank|ftp_path_genbank,release_date|release_date,' +
81
+ 'strain|strain',
82
+ nolimit: 'on',
83
+ }
85
84
  if o[:reference]
86
- $stderr.puts 'Downloading reference genomes' unless o[:q]
87
- lineno = 0
88
- get_list(nil, :reference).each_line do |ln|
89
- next if (lineno+=1)==1
90
- r = ln.chomp.split("\t")
91
- next if r[3].nil? or r[3].empty?
92
- ids = r[3].split(',')
93
- ids += r[5].split(',') unless o[:ignore_plasmids] or r[5].empty?
94
- ids.delete_if{ |i| i =~ /\A\-*\z/ }
95
- next if ids.empty?
96
- n = r[2].miga_name
97
- ds[n] = {ids: ids, md: {type: :genome}, db: :nuccore, universe: :ncbi}
98
- end
85
+ url_param[:q] += ' and refseq_category==["representative"]'
86
+ else
87
+ status = {
88
+ complete: 'Complete',
89
+ chromosome: ' Chromosome', # <- The leading space is *VERY* important!
90
+ scaffold: 'Scaffold',
91
+ contig: 'Contig'
92
+ }.map { |k, v| '"' + v + '"' if o[k] }.compact.join(',')
93
+ url_param[:q] += ' and level==[' + status + ']'
99
94
  end
95
+ url_param[:q] += ')'
96
+ url = url_base + URI.encode_www_form(url_param)
97
+ $stderr.puts 'Downloading genome list' unless o[:q]
98
+ lineno = 0
99
+ doc = MiGA::RemoteDataset.download_url(url)
100
+ CSV.parse(doc, headers: true).each do |r|
101
+ asm = r['assembly']
102
+ next if asm.nil? or asm.empty? or asm == '-'
100
103
 
101
- # Download IDs with complete or chromosome status
102
- if o[:complete] or o[:chromosome]
103
- status = (o[:complete] and o[:chromosome] ?
104
- '50|40' : o[:complete] ? '50' : '40')
105
- $stderr.puts 'Downloading complete/chromosome genomes' unless o[:q]
106
- lineno = 0
107
- get_list(o[:taxon], status).each_line do |ln|
108
- next if (lineno+=1)==1
109
- r = ln.chomp.split("\t")
110
- next if r[10].nil? or r[10].empty?
111
- ids = r[10].gsub(/[^:;]*:/,'').gsub(/\/[^\/;]*/,'').split(';')
112
- ids.delete_if{ |i| i =~ /\A\-*\z/ }
113
- next if ids.empty?
114
- acc = o[:add_version] ? ids[0] : ids[0].gsub(/\.\d+\Z/,'')
115
- n = "#{r[0]}_#{acc}".miga_name
116
- ds[n] = {ids: ids, md: {type: :genome}, db: :nuccore, universe: :ncbi}
117
- end
118
- end
104
+ # Get replicons
105
+ rep = r['replicons'].nil? ? nil : r['replicons'].
106
+ split('; ').map{ |i| i.gsub(/.*:/,'') }.map{ |i| i.gsub(/\/.*/, '') }
119
107
 
120
- # Download IDs with scaffold or contig status
121
- if o[:scaffold] or o[:contig]
122
- status = (o[:scaffold] and o[:contig] ? '30|20' : o[:scaffold] ? '30' : '20')
123
- $stderr.puts "Downloading scaffold/contig genomes" unless o[:q]
124
- lineno = 0
125
- get_list(o[:taxon], status).each_line do |ln|
126
- next if (lineno+=1)==1
127
- r = ln.chomp.split("\t")
128
- next if r[7].nil? or r[7].empty?
129
- next if r[19].nil? or r[19].empty?
130
- asm = r[7].gsub(/[^:;]*:/,'').gsub(/\/[^\/;]*/,'').gsub(/\s/,'')
131
- ids = r[19].gsub(/\s/,'').split(';').delete_if{ |i| i =~ /\A\-*\z/ }.
132
- map{ |i| "#{i}/#{File.basename(i)}_genomic.fna.gz" }
133
- next if ids.empty?
134
- n = "#{r[0]}_#{asm}".miga_name
135
- asm.gsub!(/\(.*\)/, '')
136
- ds[n] = {ids: ids, md: {type: :genome, ncbi_asm: asm},
137
- db: :assembly_gz, universe: :web}
108
+ # Set name
109
+ if o[:legacy_name] and o[:reference]
110
+ n = r['#organism'].miga_name
111
+ else
112
+ if o[:legacy_name] and ['Complete',' Chromosome'].include? r['level']
113
+ acc = rep.nil? ? '' : rep.first
114
+ else
115
+ acc = asm
116
+ end
117
+ acc.gsub!(/\.\d+\Z/, '') unless o[:add_version]
118
+ n = "#{r['#organism']}_#{acc}".miga_name
138
119
  end
120
+
121
+ # Register for download
122
+ fna_url = r['ftp_path_genbank'] + '/' +
123
+ File.basename(r['ftp_path_genbank']) + '_genomic.fna.gz'
124
+ ds[n] = {
125
+ ids: [fna_url], db: :assembly_gz, universe: :web,
126
+ md: {
127
+ type: :genome, ncbi_asm: asm, strain: r['strain']
128
+ }
129
+ }
130
+ ds[n][:md][:ncbi_nuccore] = rep.join(',') unless rep.nil?
131
+ ds[n][:md][:release_date] =
132
+ Time.parse(r['release_date']).to_s unless r['release_date'].nil?
139
133
  end
140
134
 
141
135
  # Discard blacklisted
142
136
  unless o[:blacklist].nil?
143
137
  $stderr.puts "Discarding datasets in #{o[:blacklist]}." unless o[:q]
144
- File.readlines(o[:blacklist]).map(&:chomp).each{ |i| ds.delete i }
138
+ File.readlines(o[:blacklist]).
139
+ select{ |i| i !~ /^#/ }.map(&:chomp).each{ |i| ds.delete i }
145
140
  end
146
141
 
147
142
  # Download entries
148
143
  $stderr.puts "Downloading #{ds.size} " +
149
- (ds.size > 1 ? "entries" : "entry") unless o[:q]
144
+ (ds.size == 1 ? "entry" : "entries") unless o[:q]
150
145
  ds.each do |name,body|
151
146
  d << name
152
147
  puts name
data/bin/miga CHANGED
@@ -126,7 +126,8 @@ end
126
126
  def opt_common(opt, o)
127
127
  opt.on("-v", "--verbose",
128
128
  "Print additional information to STDERR."){ o[:q]=false }
129
- opt.on("-d", "--debug INT", "Print debugging information to STDERR.") do |v|
129
+ opt.on("-d", "--debug INT",
130
+ "Print debugging information to STDERR (1: debug, 2: trace).") do |v|
130
131
  v.to_i>1 ? MiGA::MiGA.DEBUG_TRACE_ON : MiGA::MiGA.DEBUG_ON
131
132
  end
132
133
  opt.on("-h", "--help", "Display this screen.") do
data/lib/miga/daemon.rb CHANGED
@@ -12,11 +12,11 @@ class MiGA::Daemon < MiGA::MiGA
12
12
 
13
13
  ##
14
14
  # When was the last time a daemon for the MiGA::Project +project+ was seen
15
- # active? Returns DateTime.
15
+ # active? Returns Time.
16
16
  def self.last_alive(project)
17
17
  f = File.expand_path('daemon/alive', project.path)
18
18
  return nil unless File.exist? f
19
- DateTime.parse(File.read(f))
19
+ Time.parse(File.read(f))
20
20
  end
21
21
 
22
22
  # Array of all spawned daemons.
@@ -49,7 +49,7 @@ class MiGA::Daemon < MiGA::MiGA
49
49
 
50
50
  ##
51
51
  # When was the last time a daemon for the current project was seen active?
52
- # Returns DateTime.
52
+ # Returns Time.
53
53
  def last_alive
54
54
  MiGA::Daemon.last_alive project
55
55
  end
@@ -229,6 +229,10 @@ class MiGA::Daemon < MiGA::MiGA
229
229
  @loop_i += 1
230
230
  check_datasets
231
231
  check_project
232
+ if shutdown_when_done? and jobs_running.size + jobs_to_run.size == 0
233
+ say 'Nothing else to do, shutting down.'
234
+ return false
235
+ end
232
236
  flush!
233
237
  if loop_i==4
234
238
  say 'Housekeeping for sanity'
@@ -237,10 +241,6 @@ class MiGA::Daemon < MiGA::MiGA
237
241
  end
238
242
  report_status
239
243
  sleep(latency)
240
- if shutdown_when_done? and jobs_running.size+jobs_to_run.size == 0
241
- say 'Nothing else to do, shutting down.'
242
- return false
243
- end
244
244
  true
245
245
  end
246
246
 
@@ -215,7 +215,7 @@ module MiGA::Dataset::Result
215
215
  r.clean! if opts[:is_clean]
216
216
  unless r.clean?
217
217
  MiGA::MiGA.clean_fasta_file(r.file_path :proteins)
218
- MiGA::MiGA.clean_fasta_file(r.file_path :genes)
218
+ MiGA::MiGA.clean_fasta_file(r.file_path :genes) if r.file_path :genes
219
219
  r.clean!
220
220
  end
221
221
  r
@@ -14,13 +14,15 @@ end
14
14
  module MiGA::RemoteDataset::Base
15
15
 
16
16
  @@_EUTILS = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
17
+ @@_NCBI_API_KEY = lambda { |url|
18
+ ENV['NCBI_API_KEY'].nil? ? url : "#{url}&api_key=#{ENV['NCBI_API_KEY']}" }
17
19
 
18
20
  ##
19
21
  # Structure of the different database Universes or containers. The structure
20
22
  # is a Hash with universe names as keys as Symbol and values being a Hash with
21
23
  # supported keys as Symbol:
22
24
  # - +:dbs+ => Hash with keys being the database name and the values a Hash of
23
- # properties such as +stage+, +format+, and +map_to+.
25
+ # properties such as +stage+, +format+, +map_to+, and +getter+.
24
26
  # - +url+ => Pattern of the URL where the data can be obtained, where +%1$s+
25
27
  # is the name of the database, +%2$s+ is the IDs, and +%3$s+ is format.
26
28
  # Additional parameters can be passed to certain functions using the +extra+
@@ -37,21 +39,23 @@ module MiGA::RemoteDataset::Base
37
39
  assembly_gz: {stage: :assembly, format: :fasta_gz},
38
40
  text: {stage: :metadata, format: :text}
39
41
  },
40
- url: "%2$s",
42
+ url: '%2$s',
41
43
  method: :net
42
44
  },
43
45
  ebi: {
44
46
  dbs: { embl: {stage: :assembly, format: :fasta} },
45
- url: "https://www.ebi.ac.uk/Tools/dbfetch/dbfetch/%1$s/%2$s/%3$s",
47
+ url: 'https://www.ebi.ac.uk/Tools/dbfetch/dbfetch/%1$s/%2$s/%3$s',
46
48
  method: :rest
47
49
  },
48
50
  ncbi: {
49
- dbs: { nuccore: {stage: :assembly, format: :fasta} },
51
+ dbs: {
52
+ nuccore: { stage: :assembly, format: :fasta },
53
+ assembly: { stage: :assembly, format: :fasta_gz, getter: :ncbi_asm },
54
+ taxonomy: { stage: :metadata, format: :xml }
55
+ },
50
56
  url: "#{@@_EUTILS}efetch.fcgi?db=%1$s&id=%2$s&rettype=%3$s&retmode=text",
51
57
  method: :rest,
52
- api_key: lambda { |url|
53
- ENV['NCBI_API_KEY'].nil? ?
54
- url : "#{url}&api_key=#{ENV['NCBI_API_KEY']}" }
58
+ api_key: @@_NCBI_API_KEY
55
59
  },
56
60
  ncbi_map: {
57
61
  dbs: {
@@ -62,9 +66,19 @@ module MiGA::RemoteDataset::Base
62
66
  url: "#{@@_EUTILS}elink.fcgi?dbfrom=%1$s&id=%2$s&db=%4$s&retmode=%3$s",
63
67
  method: :net,
64
68
  map_to_universe: :ncbi,
65
- api_key: lambda { |url|
66
- ENV['NCBI_API_KEY'].nil? ?
67
- url : "#{url}&api_key=#{ENV['NCBI_API_KEY']}" }
69
+ api_key: @@_NCBI_API_KEY
70
+ },
71
+ ncbi_summary: {
72
+ dbs: { assembly: { stage: :metadata, format: :json } },
73
+ url: "#{@@_EUTILS}esummary.fcgi?db=%1$s&id=%2$s&retmode=%3$s",
74
+ method: :rest,
75
+ api_key: @@_NCBI_API_KEY
76
+ },
77
+ ncbi_search: {
78
+ dbs: { assembly: { stage: :metadata, format: :json } },
79
+ url: "#{@@_EUTILS}esearch.fcgi?db=%1$s&term=%2$s&retmode=%3$s",
80
+ method: :rest,
81
+ api_key: @@_NCBI_API_KEY
68
82
  }
69
83
  }
70
84
 
@@ -10,15 +10,22 @@ class MiGA::RemoteDataset
10
10
  # Download data from the +universe+ in the database +db+ with IDs +ids+ and
11
11
  # in +format+. If passed, it saves the result in +file+. Additional
12
12
  # parameters specific to the download method can be passed using +extra+.
13
- # Returns String.
14
- def download(universe, db, ids, format, file = nil, extra = [])
13
+ # Returns String. The +obj+ can also be passed as MiGA::RemoteDataset or
14
+ # MiGA::Dataset.
15
+ def download(universe, db, ids, format, file = nil, extra = [], obj = nil)
15
16
  ids = [ids] unless ids.is_a? Array
16
- case @@UNIVERSE[universe][:method]
17
- when :rest
18
- doc = download_rest(universe, db, ids, format, extra)
19
- when :net
20
- doc = download_net(universe, db, ids, format, extra)
21
- end
17
+ getter = @@UNIVERSE[universe][:dbs][db][:getter] || :download
18
+ method = @@UNIVERSE[universe][:method]
19
+ opts = {
20
+ universe: universe,
21
+ db: db,
22
+ ids: ids,
23
+ format: format,
24
+ file: file,
25
+ extra: extra,
26
+ obj: obj
27
+ }
28
+ doc = send("#{getter}_#{method}", opts)
22
29
  unless file.nil?
23
30
  ofh = File.open(file, 'w')
24
31
  ofh.print doc
@@ -28,20 +35,37 @@ class MiGA::RemoteDataset
28
35
  end
29
36
 
30
37
  ##
31
- # Download data using a REST method from the +universe+ in the database +db+
32
- # with IDs +ids+ and in +format+. Additional URL parameters can be passed
33
- # using +extra+. Returns the doc as String.
34
- def download_rest(universe, db, ids, format, extra = [])
35
- u = @@UNIVERSE[universe]
36
- url = sprintf(u[:url], db, ids.join(","), format, *extra)
38
+ # Download data from NCBI Assembly database using the REST method.
39
+ # Supported +opts+ (Hash) include:
40
+ # +obj+ (mandatory): MiGA::RemoteDataset
41
+ # +ids+ (mandatory): String or Array of String
42
+ # +file+: String, passed to download
43
+ # +extra+: Array, passed to download
44
+ # +format+: String, passed to download
45
+ def ncbi_asm_rest(opts)
46
+ url_dir = opts[:obj].ncbi_asm_json_doc['ftppath_genbank']
47
+ url = "#{url_dir}/#{File.basename url_dir}_genomic.fna.gz"
48
+ download(:web, :assembly_gz, url,
49
+ opts[:format], opts[:file], opts[:extra], opts[:obj])
50
+ end
51
+
52
+ ##
53
+ # Download data using the REST method. Supported +opts+ (Hash) include:
54
+ # +universe+ (mandatory): Symbol
55
+ # +db+ (mandatory): Symbol
56
+ # +ids+ (mandatory): Array of String
57
+ # +format+: String
58
+ # +extra+: Array
59
+ def download_rest(opts)
60
+ u = @@UNIVERSE[opts[:universe]]
61
+ url = sprintf(u[:url],
62
+ opts[:db], opts[:ids].join(','), opts[:format], *opts[:extra])
37
63
  url = u[:api_key][url] unless u[:api_key].nil?
38
64
  download_url url
39
65
  end
40
66
 
41
67
  ##
42
- # Download data using a GET request from the +universe+ in the database +db+
43
- # with IDs +ids+ and in +format+. Additional URL parameters can be passed
44
- # using +extra+. Returns the doc as String.
68
+ # Alias of download_rest
45
69
  alias download_net download_rest
46
70
 
47
71
  ##
@@ -51,6 +75,7 @@ class MiGA::RemoteDataset
51
75
  doc = ''
52
76
  @timeout_try = 0
53
77
  begin
78
+ DEBUG 'GET: ' + url
54
79
  open(url, read_timeout: 600) { |f| doc = f.read }
55
80
  rescue => e
56
81
  @timeout_try += 1
@@ -82,6 +107,6 @@ module MiGA::RemoteDataset::Download
82
107
  # Download data into +file+.
83
108
  def download(file)
84
109
  self.class.download(universe, db, ids,
85
- self.class.UNIVERSE[universe][:dbs][db][:format], file)
110
+ self.class.UNIVERSE[universe][:dbs][db][:format], file, [], self)
86
111
  end
87
112
  end
@@ -8,6 +8,16 @@ require 'miga/remote_dataset/download'
8
8
  class MiGA::RemoteDataset < MiGA::MiGA
9
9
  include MiGA::RemoteDataset::Download
10
10
 
11
+ # Class-level
12
+
13
+ class << self
14
+ def ncbi_asm_acc2id(acc)
15
+ return acc if acc =~ /^\d+$/
16
+ search_doc = JSON.parse download(:ncbi_search, :assembly, acc, :json)
17
+ search_doc['esearchresult']['idlist'].first
18
+ end
19
+ end
20
+
11
21
  # Instance-level
12
22
 
13
23
  ##
@@ -19,6 +29,8 @@ class MiGA::RemoteDataset < MiGA::MiGA
19
29
  attr_reader :ids
20
30
  # Internal metadata hash
21
31
  attr_reader :metadata
32
+ # NCBI Assembly XML document
33
+ @_ncbi_asm_xml_doc = nil
22
34
 
23
35
  ##
24
36
  # Initialize MiGA::RemoteDataset with +ids+ in database +db+ from +universe+.
@@ -33,6 +45,7 @@ class MiGA::RemoteDataset < MiGA::MiGA
33
45
  raise "Unknown Universe: #{@universe}. Try: #{@@UNIVERSE.keys}"
34
46
  @@UNIVERSE[@universe][:dbs].include?(@db) or
35
47
  raise "Unknown Database: #{@db}. Try: #{@@UNIVERSE[@universe][:dbs]}"
48
+ @_ncbi_asm_json_doc = nil
36
49
  # FIXME: Part of the +map_to+ support:
37
50
  # unless @@UNIVERSE[@universe][:dbs][@db][:map_to].nil?
38
51
  # MiGA::RemoteDataset.download
@@ -87,7 +100,8 @@ class MiGA::RemoteDataset < MiGA::MiGA
87
100
  ##
88
101
  # Get NCBI Taxonomy ID.
89
102
  def get_ncbi_taxid
90
- send("get_ncbi_taxid_from_#{universe}")
103
+ origin = (universe == :ncbi and db == :assembly) ? :web : universe
104
+ send("get_ncbi_taxid_from_#{origin}")
91
105
  end
92
106
 
93
107
  ##
@@ -107,6 +121,7 @@ class MiGA::RemoteDataset < MiGA::MiGA
107
121
  # Get NCBI taxonomy as MiGA::Taxonomy.
108
122
  def get_ncbi_taxonomy
109
123
  tax_id = get_ncbi_taxid
124
+ return nil if tax_id.nil?
110
125
  lineage = {}
111
126
  doc = MiGA::RemoteDataset.download(:ncbi, :taxonomy, tax_id, :xml)
112
127
  doc.scan(%r{<Taxon>(.*?)</Taxon>}m).map(&:first).each do |i|
@@ -119,15 +134,24 @@ class MiGA::RemoteDataset < MiGA::MiGA
119
134
  MiGA::Taxonomy.new(lineage)
120
135
  end
121
136
 
137
+ ##
138
+ # Get the JSON document describing an NCBI assembly entry.
139
+ def ncbi_asm_json_doc
140
+ return @_ncbi_asm_json_doc unless @_ncbi_asm_json_doc.nil?
141
+ metadata[:ncbi_asm] ||= ids.first if universe == :ncbi and db == :assembly
142
+ return nil unless metadata[:ncbi_asm]
143
+ ncbi_asm_id = self.class.ncbi_asm_acc2id metadata[:ncbi_asm]
144
+ doc = JSON.parse(
145
+ self.class.download(:ncbi_summary, :assembly, ncbi_asm_id, :json))
146
+ @_ncbi_asm_json_doc = doc['result'][ doc['result']['uids'].first ]
147
+ end
148
+
149
+
122
150
  private
123
151
 
124
152
  def get_ncbi_taxid_from_web
125
- return nil unless metadata[:ncbi_asm]
126
- base_url = 'https://www.ncbi.nlm.nih.gov/assembly'
127
- doc = self.class.download_url(
128
- "#{base_url}/#{metadata[:ncbi_asm]}?report=xml&format=text")
129
- taxid = doc.scan(%r{&lt;Taxid&gt;(\S+)&lt;/Taxid&gt;}).first
130
- taxid.nil? ? taxid : taxid.first
153
+ return nil if ncbi_asm_json_doc.nil?
154
+ ncbi_asm_json_doc['taxid']
131
155
  end
132
156
 
133
157
  def get_ncbi_taxid_from_ncbi
@@ -154,29 +178,28 @@ class MiGA::RemoteDataset < MiGA::MiGA
154
178
  biosample = self.class.ncbi_map(metadata[:ncbi_nuccore],
155
179
  :nuccore, :biosample)
156
180
  return metadata if biosample.nil?
157
- asm = self.class.ncbi_map(biosample,
158
- :biosample, :assembly)
181
+ asm = self.class.ncbi_map(biosample, :biosample, :assembly)
159
182
  metadata[:ncbi_asm] = asm.to_s unless asm.nil?
160
183
  get_type_status_ncbi_asm metadata
161
184
  end
162
185
 
163
186
  def get_type_status_ncbi_asm(metadata)
164
- return metadata if metadata[:ncbi_asm].nil?
165
- doc = CGI.unescapeHTML(self.class.download(:web, :text,
166
- "https://www.ncbi.nlm.nih.gov/assembly/" \
167
- "#{metadata[:ncbi_asm]}?report=xml", :xml)).each_line
168
- from_type = doc.grep(%r{<FromType/?>}).first or return metadata
169
- if from_type =~ %r{<FromType/>}
187
+ return metadata if ncbi_asm_json_doc.nil?
188
+ from_type = ncbi_asm_json_doc['from_type']
189
+ from_type = ncbi_asm_json_doc['fromtype'] if from_type.nil?
190
+ case from_type
191
+ when nil
192
+ # Do nothing
193
+ when ''
170
194
  metadata[:is_type] = false
171
195
  metadata[:is_ref_type] = false
172
- elsif from_type =~ %r{<FromType>(.*)</FromType>}
173
- if $1 == 'assembly from reference material'
174
- metadata[:is_type] = false
175
- metadata[:is_ref_type] = true
176
- else
177
- metadata[:is_type] = true
178
- end
179
- metadata[:type_rel] = $1
196
+ when 'assembly from reference material'
197
+ metadata[:is_type] = false
198
+ metadata[:is_ref_type] = true
199
+ metadata[:type_rel] = from_type
200
+ else
201
+ metadata[:is_type] = true
202
+ metadata[:type_rel] = from_type
180
203
  end
181
204
  metadata
182
205
  end
@@ -7,14 +7,14 @@ module MiGA::Result::Dates
7
7
  include MiGA::Result::Base
8
8
 
9
9
  ##
10
- # Returns the start date of processing as DateTime or +nil+ if it doesn't
10
+ # Returns the start date of processing as Time or +nil+ if it doesn't
11
11
  # exist.
12
12
  def started_at
13
13
  date_at :start
14
14
  end
15
15
 
16
16
  ##
17
- # Returns the end (done) date of processing as DateTime or +nil+ if it doesn't
17
+ # Returns the end (done) date of processing as Time or +nil+ if it doesn't
18
18
  # exist.
19
19
  def done_at
20
20
  date_at :done
@@ -38,7 +38,7 @@ module MiGA::Result::Dates
38
38
  f = path event
39
39
  date = File.read(f) if File.size? f
40
40
  end
41
- date.nil? ? nil : DateTime.parse(date)
41
+ date.nil? ? nil : Time.parse(date)
42
42
  end
43
43
  end
44
44
 
data/lib/miga/version.rb CHANGED
@@ -10,7 +10,7 @@ module MiGA
10
10
  # - Float representing the major.minor version.
11
11
  # - Integer representing gem releases of the current version.
12
12
  # - Integer representing minor changes that require new version number.
13
- VERSION = [0.3, 7, 1]
13
+ VERSION = [0.3, 8, 0]
14
14
 
15
15
  ##
16
16
  # Nickname for the current major.minor version.
@@ -18,7 +18,7 @@ module MiGA
18
18
 
19
19
  ##
20
20
  # Date of the current gem release.
21
- VERSION_DATE = Date.today
21
+ VERSION_DATE = Date.new(2019, 02, 28)
22
22
 
23
23
  ##
24
24
  # Reference of MiGA.
data/test/daemon_test.rb CHANGED
@@ -55,7 +55,7 @@ class DaemonTest < Test::Unit::TestCase
55
55
  out = capture_stdout do
56
56
  d.in_loop
57
57
  end
58
- assert_equal(DateTime, d.last_alive.class)
58
+ assert_equal(Time, d.last_alive.class)
59
59
  assert(out.string =~ /-{20}\n.*MiGA:#{p.name} launched/)
60
60
  2.times{ d.in_loop }
61
61
  assert_equal(3, d.loop_i)
@@ -96,7 +96,7 @@ class DaemonTest < Test::Unit::TestCase
96
96
  d = MiGA::Daemon.new(p)
97
97
  assert_nil(d.last_alive)
98
98
  d.declare_alive
99
- assert(d.last_alive - DateTime.now < 1)
99
+ assert(d.last_alive - Time.now < 1)
100
100
  end
101
101
 
102
102
  def test_options
@@ -68,7 +68,7 @@ module MiGA::DistanceRunner::Database
68
68
  if dataset.is_ref? and project.path == ref_project.path
69
69
  y = data_from_db(
70
70
  target.name, dataset.name, ref_db(metric, target.name), metric)
71
- unless y.nil? or y.first.zero?
71
+ unless y.nil? or y.first.nil? or y.first.zero?
72
72
  # Store a copy
73
73
  data_to_db(dataset.name, target.name, tmp_dbs[metric], metric, y)
74
74
  return y.first
data/utils/subclades.R CHANGED
@@ -48,12 +48,18 @@ subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
48
48
  ani.types <- a[,2]
49
49
  names(ani.types) <- a[,1]
50
50
  if(length(ani.d) == 0) load(dist_rdata)
51
- }else{
51
+ }else if(length(labels(ani.d)) > 8L){
52
52
  res <- subclade_clustering(out_base, thr, ani.d, dist_rdata)
53
53
  if(length(res) == 0) return(NULL)
54
54
  ani.medoids <- res[['ani.medoids']]
55
55
  ani.types <- res[['ani.types']]
56
56
  ani.d <- res[['ani.d']]
57
+ }else{
58
+ ani.medoids <- labels(ani.d)[which.min(colSums(as.matrix(ani.d)))]
59
+ ani.types <- rep(1, length(labels(ani.d)))
60
+ names(ani.types) <- labels(ani.d)
61
+ generate_empty_files(out_base)
62
+ write_text_report(out_base, ani.d, ani.medoids, ani.types)
57
63
  }
58
64
 
59
65
  # Recursive search
@@ -136,16 +142,7 @@ subclade_clustering <- function(out_base, thr, ani.d, dist_rdata) {
136
142
  dev.off()
137
143
 
138
144
  # Save results
139
- say("Text report")
140
- write.table(ani.medoids, paste(out_base, "medoids", sep="."),
141
- quote=FALSE, col.names=FALSE, row.names=FALSE)
142
- classif <- cbind(names(ani.types), ani.types, ani.medoids[ ani.types ], NA)
143
- ani.d.m <- 100 - as.matrix(ani.d)*100
144
- for(j in 1:nrow(classif)){
145
- classif[j,4] <- ani.d.m[classif[j,1], classif[j,3]]
146
- }
147
- write.table(classif, paste(out_base,"classif",sep="."),
148
- quote=FALSE, col.names=FALSE, row.names=FALSE, sep="\t")
145
+ write_text_report(out_base, ani.d, ani.medoids, ani.types)
149
146
 
150
147
  # Return data
151
148
  say("Cluster ready")
@@ -168,6 +165,19 @@ generate_empty_files <- function(out_base) {
168
165
  file.create(paste(out_base,".1.medoids",sep=""))
169
166
  }
170
167
 
168
+ write_text_report <- function(out_base, ani.d, ani.medoids, ani.types){
169
+ say("Text report")
170
+ write.table(ani.medoids, paste(out_base, "medoids", sep="."),
171
+ quote=FALSE, col.names=FALSE, row.names=FALSE)
172
+ classif <- cbind(names(ani.types), ani.types, ani.medoids[ ani.types ], NA)
173
+ ani.d.m <- 100 - as.matrix(ani.d)*100
174
+ for(j in 1:nrow(classif)){
175
+ classif[j,4] <- ani.d.m[classif[j,1], classif[j,3]]
176
+ }
177
+ write.table(classif, paste(out_base,"classif",sep="."),
178
+ quote=FALSE, col.names=FALSE, row.names=FALSE, sep="\t")
179
+ }
180
+
171
181
  plot_silhouette <- function(k, s, ns, ds, top.n) {
172
182
  # s
173
183
  par(mar=c(4,5,1,5)+0.1)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: miga-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.7.1
4
+ version: 0.3.8.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-02-08 00:00:00.000000000 Z
11
+ date: 2019-02-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: daemons