miga-base 0.3.7.1 → 0.3.8.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: aeb46208fbfdb522754876da59589895784fb72de06afdded9000066f34c569d
4
- data.tar.gz: 61d2afe7e630ebc4635b38f3ee72ba89e7faebfaec502f130bb9bcadc970b951
3
+ metadata.gz: c7f7bb9ba42ccdbed81ce05484031e17a43c6e688e89c5622327aadbee9d0f31
4
+ data.tar.gz: 3ca5e3189bb65b213fe43a948710d69ea304dac4c838c176838773fc87a88c5b
5
5
  SHA512:
6
- metadata.gz: 7ce52a87fa8f5fa8609546b6d98d9f8e49c139ad8894f45954cede0b3ce2dea7a7035dbdf308a0fb0478ea630e51d86476cc67102182091151e21fc6e6c698f3
7
- data.tar.gz: e108be9b315fe8315278e2ff22fbf86e3896c01e132dedb0f0e2183e0415626beece1ea4d82318b5365b4b84a60294a9728171ef37efff2650b8f39c062f28cc
6
+ metadata.gz: 9fa0deb9770be85a71145fcedd3a8a590ad7a6584f274245ed47df7decde6ddcb8d48c635585773a7005ceca4b03d37e67620346faff3eea387803d113a726f8
7
+ data.tar.gz: 17eeaad6dc985ff89d813ece94d3e2837fac4a40223cde14063b3236ba5dbd7ff581c6e7df30a4919cb002dd6918a42ac820cf8a3045f99714486f05bbc252fe
data/README.md CHANGED
@@ -11,9 +11,10 @@
11
11
 
12
12
  For additional information on MiGA, visit:
13
13
 
14
+ * [MiGA Online][miga-online]: The Microbial Genomes Atlas Online.
14
15
  * [MiGA users list][mailing-list]:
15
16
  Forum to discuss with other users and developers.
16
- * [MiGA manual][gitbook]: The definitive guide to MiGA.
17
+ * [MiGA manual][manual]: The definitive guide to MiGA.
17
18
  * [MiGA API docs][rubydoc]: Inner-workings of the `miga-base` gem.
18
19
  * [MiGA Web][miga-web]: MiGA on Rails!
19
20
 
@@ -46,8 +47,8 @@ Technology and [RDP][rdp] at Michigan State University.
46
47
  See [LICENSE](LICENSE).
47
48
 
48
49
  [lrr]: http://lmrodriguezr.github.io/
49
- [mailing-list]: https://groups.google.com/forum/#!forum/miga-users
50
- [gitbook]: https://miga.gitbooks.io/miga/content/
50
+ [mailing-list]: http://support.microbial-genomes.org/
51
+ [manual]: https://manual.microbial-genomes.org/
51
52
  [rubydoc]: http://www.rubydoc.info/github/bio-miga/miga
52
53
  [contact]: http://enve-omics.gatech.edu/node/7
53
54
  [miga-web]: https://github.com/bio-miga/miga-web
data/actions/init.rb CHANGED
@@ -229,7 +229,7 @@ unless File.exist?(daemon_f) and ask_user(
229
229
  v[:latency] = ask_user(
230
230
  'How long should I sleep? (in seconds)', '150').to_i
231
231
  v[:maxjobs] = ask_user('How many jobs can I launch at once?', '300').to_i
232
- v[:ppn] = ask_user('How many CPUs can I use per job?', '4').to_i
232
+ v[:ppn] = ask_user('How many CPUs can I use per job?', '2').to_i
233
233
  $stderr.puts 'Setting up internal daemon defaults.'
234
234
  $stderr.puts 'If you don\'t understand this just leave default values:'
235
235
  v[:cmd] = ask_user(
@@ -245,7 +245,7 @@ unless File.exist?(daemon_f) and ask_user(
245
245
  v[:alive] = ask_user(
246
246
  "How can I know that a process is still alive?\n %1$s: job id, " +
247
247
  "output should be 1 for running and 0 for non-running.\n",
248
- "squeue -h -o %t -j '%1$s' | grep '^PD\|R\|CF\|CG$' " +
248
+ "squeue -h -o %%t -j '%1$s' | grep '^PD\\|R\\|CF\\|CG$' " +
249
249
  "| tail -n 1 | wc -l")
250
250
  v[:kill] = ask_user(
251
251
  "How should I terminate tasks?\n %s: process ID.", "scancel '%s'")
@@ -254,7 +254,7 @@ unless File.exist?(daemon_f) and ask_user(
254
254
  v[:latency] = ask_user(
255
255
  'How long should I sleep? (in seconds)', '150').to_i
256
256
  v[:maxjobs] = ask_user('How many jobs can I launch at once?', '300').to_i
257
- v[:ppn] = ask_user('How many CPUs can I use per job?', '4').to_i
257
+ v[:ppn] = ask_user('How many CPUs can I use per job?', '2').to_i
258
258
  $stderr.puts 'Setting up internal daemon defaults.'
259
259
  $stderr.puts 'If you don\'t understand this just leave default values:'
260
260
  v[:cmd] = ask_user(
data/actions/ncbi_get.rb CHANGED
@@ -4,26 +4,25 @@
4
4
  # @license Artistic-2.0
5
5
 
6
6
  require 'miga/remote_dataset'
7
+ require 'csv'
7
8
 
8
9
  o = {q:true, query:false, unlink:false,
9
- reference: false, ignore_plasmids: false,
10
- complete: false, chromosome: false,
11
- scaffold: false, contig: false, add_version: true, dry: false,
12
- get_md: false}
10
+ reference: false, legacy_name: false,
11
+ complete: false, chromosome: false,
12
+ scaffold: false, contig: false, add_version: true, dry: false,
13
+ get_md: false}
13
14
  OptionParser.new do |opt|
14
15
  opt_banner(opt)
15
16
  opt_object(opt, o, [:project])
16
17
  opt.on('-T', '--taxon STRING',
17
- '(Mandatory unless --reference) Taxon name (e.g., a species binomial).'
18
- ){ |v| o[:taxon]=v }
18
+ '(Mandatory unless --reference) Taxon name (e.g., a species binomial).'
19
+ ){ |v| o[:taxon]=v }
19
20
  opt.on('--reference',
20
- 'Download all reference genomes (ignores -T).'){ |v| o[:reference]=v }
21
- opt.on('--ref-no-plasmids',
22
- 'If passed, ignores plasmids (only for --reference).'
23
- ){ |v| o[:ignore_plasmids]=v }
21
+ 'Download all reference genomes (ignores any other status).'
22
+ ){ |v| o[:reference]=v }
24
23
  opt.on('--complete', 'Download complete genomes.'){ |v| o[:complete]=v }
25
24
  opt.on('--chromosome',
26
- 'Download complete chromosomes.'){ |v| o[:chromosome]=v }
25
+ 'Download complete chromosomes.'){ |v| o[:chromosome]=v }
27
26
  opt.on('--scaffold', 'Download genomes in scaffolds.'){ |v| o[:scaffold]=v }
28
27
  opt.on('--contig', 'Download genomes in contigs.'){ |v| o[:contig]=v }
29
28
  opt.on('--all', 'Download all genomes (in any status).') do
@@ -33,23 +32,26 @@ OptionParser.new do |opt|
33
32
  o[:contig] = true
34
33
  end
35
34
  opt.on('--no-version-name',
36
- 'Do not add sequence version to the dataset name.',
37
- 'Only affects --complete and --chromosome.'){ |v| o[:add_version]=v }
35
+ 'Do not add sequence version to the dataset name.',
36
+ 'Only affects --complete and --chromosome.'){ |v| o[:add_version]=v }
37
+ opt.on('--legacy-name',
38
+ 'Use dataset names based on chromosome entries instead of assembly.'
39
+ ){ |v| o[:legacy_name] = v }
38
40
  opt.on('--blacklist PATH',
39
- 'A file with dataset names to blacklist.'){ |v| o[:blacklist] = v }
41
+ 'A file with dataset names to blacklist.'){ |v| o[:blacklist] = v }
40
42
  opt.on('--dry', 'Do not download or save the datasets.'){ |v| o[:dry] = v }
41
43
  opt.on('--get-metadata',
42
- 'Only download and update metadata for existing datasets'
43
- ){ |v| o[:get_md] = v }
44
+ 'Only download and update metadata for existing datasets'
45
+ ){ |v| o[:get_md] = v }
44
46
  opt.on('-q', '--query',
45
- 'Register the datasets as queries, not reference datasets.'
46
- ){ |v| o[:query]=v }
47
+ 'Register the datasets as queries, not reference datasets.'
48
+ ){ |v| o[:query]=v }
47
49
  opt.on('-u', '--unlink',
48
- 'Unlink all datasets in the project missing from the download list.'
49
- ){ |v| o[:unlink]=v }
50
+ 'Unlink all datasets in the project missing from the download list.'
51
+ ){ |v| o[:unlink]=v }
50
52
  opt.on('-R', '--remote-list PATH',
51
- 'Path to an output file with the list of all datasets listed remotely.'
52
- ){ |v| o[:remote_list]=v }
53
+ 'Path to an output file with the list of all datasets listed remotely.'
54
+ ){ |v| o[:remote_list]=v }
53
55
  opt.on('--api-key STRING', 'NCBI API key.'){ |v| ENV['NCBI_API_KEY'] = v }
54
56
  opt_common(opt, o)
55
57
  end.parse!
@@ -68,85 +70,78 @@ d = []
68
70
  ds = {}
69
71
  downloaded = 0
70
72
 
71
- def get_list(taxon, status)
72
- url_base = 'https://www.ncbi.nlm.nih.gov/genomes/Genome2BE/genome2srv.cgi?'
73
- url_param = if status==:reference
74
- { action: 'refgenomes', download: 'on' }
75
- else
76
- { action: 'download', report: 'proks', group: '-- All Prokaryotes --',
77
- subgroup: '-- All Prokaryotes --', orgn: "#{taxon}[orgn]",
78
- status: status }
79
- end
80
- url = url_base + URI.encode_www_form(url_param)
81
- MiGA::RemoteDataset.download_url url
82
- end
83
-
84
- # Download IDs with reference status
73
+ url_base = 'https://www.ncbi.nlm.nih.gov/genomes/solr2txt.cgi?'
74
+ url_param = {
75
+ q: '[display()].' +
76
+ 'from(GenomeAssemblies).' +
77
+ 'usingschema(/schema/GenomeAssemblies).' +
78
+ 'matching(tab==["Prokaryotes"] and q=="' + o[:taxon].tr('"',"'") + '"',
79
+ fields: 'organism|organism,assembly|assembly,replicons|replicons,' +
80
+ 'level|level,ftp_path_genbank|ftp_path_genbank,release_date|release_date,' +
81
+ 'strain|strain',
82
+ nolimit: 'on',
83
+ }
85
84
  if o[:reference]
86
- $stderr.puts 'Downloading reference genomes' unless o[:q]
87
- lineno = 0
88
- get_list(nil, :reference).each_line do |ln|
89
- next if (lineno+=1)==1
90
- r = ln.chomp.split("\t")
91
- next if r[3].nil? or r[3].empty?
92
- ids = r[3].split(',')
93
- ids += r[5].split(',') unless o[:ignore_plasmids] or r[5].empty?
94
- ids.delete_if{ |i| i =~ /\A\-*\z/ }
95
- next if ids.empty?
96
- n = r[2].miga_name
97
- ds[n] = {ids: ids, md: {type: :genome}, db: :nuccore, universe: :ncbi}
98
- end
85
+ url_param[:q] += ' and refseq_category==["representative"]'
86
+ else
87
+ status = {
88
+ complete: 'Complete',
89
+ chromosome: ' Chromosome', # <- The leading space is *VERY* important!
90
+ scaffold: 'Scaffold',
91
+ contig: 'Contig'
92
+ }.map { |k, v| '"' + v + '"' if o[k] }.compact.join(',')
93
+ url_param[:q] += ' and level==[' + status + ']'
99
94
  end
95
+ url_param[:q] += ')'
96
+ url = url_base + URI.encode_www_form(url_param)
97
+ $stderr.puts 'Downloading genome list' unless o[:q]
98
+ lineno = 0
99
+ doc = MiGA::RemoteDataset.download_url(url)
100
+ CSV.parse(doc, headers: true).each do |r|
101
+ asm = r['assembly']
102
+ next if asm.nil? or asm.empty? or asm == '-'
100
103
 
101
- # Download IDs with complete or chromosome status
102
- if o[:complete] or o[:chromosome]
103
- status = (o[:complete] and o[:chromosome] ?
104
- '50|40' : o[:complete] ? '50' : '40')
105
- $stderr.puts 'Downloading complete/chromosome genomes' unless o[:q]
106
- lineno = 0
107
- get_list(o[:taxon], status).each_line do |ln|
108
- next if (lineno+=1)==1
109
- r = ln.chomp.split("\t")
110
- next if r[10].nil? or r[10].empty?
111
- ids = r[10].gsub(/[^:;]*:/,'').gsub(/\/[^\/;]*/,'').split(';')
112
- ids.delete_if{ |i| i =~ /\A\-*\z/ }
113
- next if ids.empty?
114
- acc = o[:add_version] ? ids[0] : ids[0].gsub(/\.\d+\Z/,'')
115
- n = "#{r[0]}_#{acc}".miga_name
116
- ds[n] = {ids: ids, md: {type: :genome}, db: :nuccore, universe: :ncbi}
117
- end
118
- end
104
+ # Get replicons
105
+ rep = r['replicons'].nil? ? nil : r['replicons'].
106
+ split('; ').map{ |i| i.gsub(/.*:/,'') }.map{ |i| i.gsub(/\/.*/, '') }
119
107
 
120
- # Download IDs with scaffold or contig status
121
- if o[:scaffold] or o[:contig]
122
- status = (o[:scaffold] and o[:contig] ? '30|20' : o[:scaffold] ? '30' : '20')
123
- $stderr.puts "Downloading scaffold/contig genomes" unless o[:q]
124
- lineno = 0
125
- get_list(o[:taxon], status).each_line do |ln|
126
- next if (lineno+=1)==1
127
- r = ln.chomp.split("\t")
128
- next if r[7].nil? or r[7].empty?
129
- next if r[19].nil? or r[19].empty?
130
- asm = r[7].gsub(/[^:;]*:/,'').gsub(/\/[^\/;]*/,'').gsub(/\s/,'')
131
- ids = r[19].gsub(/\s/,'').split(';').delete_if{ |i| i =~ /\A\-*\z/ }.
132
- map{ |i| "#{i}/#{File.basename(i)}_genomic.fna.gz" }
133
- next if ids.empty?
134
- n = "#{r[0]}_#{asm}".miga_name
135
- asm.gsub!(/\(.*\)/, '')
136
- ds[n] = {ids: ids, md: {type: :genome, ncbi_asm: asm},
137
- db: :assembly_gz, universe: :web}
108
+ # Set name
109
+ if o[:legacy_name] and o[:reference]
110
+ n = r['#organism'].miga_name
111
+ else
112
+ if o[:legacy_name] and ['Complete',' Chromosome'].include? r['level']
113
+ acc = rep.nil? ? '' : rep.first
114
+ else
115
+ acc = asm
116
+ end
117
+ acc.gsub!(/\.\d+\Z/, '') unless o[:add_version]
118
+ n = "#{r['#organism']}_#{acc}".miga_name
138
119
  end
120
+
121
+ # Register for download
122
+ fna_url = r['ftp_path_genbank'] + '/' +
123
+ File.basename(r['ftp_path_genbank']) + '_genomic.fna.gz'
124
+ ds[n] = {
125
+ ids: [fna_url], db: :assembly_gz, universe: :web,
126
+ md: {
127
+ type: :genome, ncbi_asm: asm, strain: r['strain']
128
+ }
129
+ }
130
+ ds[n][:md][:ncbi_nuccore] = rep.join(',') unless rep.nil?
131
+ ds[n][:md][:release_date] =
132
+ Time.parse(r['release_date']).to_s unless r['release_date'].nil?
139
133
  end
140
134
 
141
135
  # Discard blacklisted
142
136
  unless o[:blacklist].nil?
143
137
  $stderr.puts "Discarding datasets in #{o[:blacklist]}." unless o[:q]
144
- File.readlines(o[:blacklist]).map(&:chomp).each{ |i| ds.delete i }
138
+ File.readlines(o[:blacklist]).
139
+ select{ |i| i !~ /^#/ }.map(&:chomp).each{ |i| ds.delete i }
145
140
  end
146
141
 
147
142
  # Download entries
148
143
  $stderr.puts "Downloading #{ds.size} " +
149
- (ds.size > 1 ? "entries" : "entry") unless o[:q]
144
+ (ds.size == 1 ? "entry" : "entries") unless o[:q]
150
145
  ds.each do |name,body|
151
146
  d << name
152
147
  puts name
data/bin/miga CHANGED
@@ -126,7 +126,8 @@ end
126
126
  def opt_common(opt, o)
127
127
  opt.on("-v", "--verbose",
128
128
  "Print additional information to STDERR."){ o[:q]=false }
129
- opt.on("-d", "--debug INT", "Print debugging information to STDERR.") do |v|
129
+ opt.on("-d", "--debug INT",
130
+ "Print debugging information to STDERR (1: debug, 2: trace).") do |v|
130
131
  v.to_i>1 ? MiGA::MiGA.DEBUG_TRACE_ON : MiGA::MiGA.DEBUG_ON
131
132
  end
132
133
  opt.on("-h", "--help", "Display this screen.") do
data/lib/miga/daemon.rb CHANGED
@@ -12,11 +12,11 @@ class MiGA::Daemon < MiGA::MiGA
12
12
 
13
13
  ##
14
14
  # When was the last time a daemon for the MiGA::Project +project+ was seen
15
- # active? Returns DateTime.
15
+ # active? Returns Time.
16
16
  def self.last_alive(project)
17
17
  f = File.expand_path('daemon/alive', project.path)
18
18
  return nil unless File.exist? f
19
- DateTime.parse(File.read(f))
19
+ Time.parse(File.read(f))
20
20
  end
21
21
 
22
22
  # Array of all spawned daemons.
@@ -49,7 +49,7 @@ class MiGA::Daemon < MiGA::MiGA
49
49
 
50
50
  ##
51
51
  # When was the last time a daemon for the current project was seen active?
52
- # Returns DateTime.
52
+ # Returns Time.
53
53
  def last_alive
54
54
  MiGA::Daemon.last_alive project
55
55
  end
@@ -229,6 +229,10 @@ class MiGA::Daemon < MiGA::MiGA
229
229
  @loop_i += 1
230
230
  check_datasets
231
231
  check_project
232
+ if shutdown_when_done? and jobs_running.size + jobs_to_run.size == 0
233
+ say 'Nothing else to do, shutting down.'
234
+ return false
235
+ end
232
236
  flush!
233
237
  if loop_i==4
234
238
  say 'Housekeeping for sanity'
@@ -237,10 +241,6 @@ class MiGA::Daemon < MiGA::MiGA
237
241
  end
238
242
  report_status
239
243
  sleep(latency)
240
- if shutdown_when_done? and jobs_running.size+jobs_to_run.size == 0
241
- say 'Nothing else to do, shutting down.'
242
- return false
243
- end
244
244
  true
245
245
  end
246
246
 
@@ -215,7 +215,7 @@ module MiGA::Dataset::Result
215
215
  r.clean! if opts[:is_clean]
216
216
  unless r.clean?
217
217
  MiGA::MiGA.clean_fasta_file(r.file_path :proteins)
218
- MiGA::MiGA.clean_fasta_file(r.file_path :genes)
218
+ MiGA::MiGA.clean_fasta_file(r.file_path :genes) if r.file_path :genes
219
219
  r.clean!
220
220
  end
221
221
  r
@@ -14,13 +14,15 @@ end
14
14
  module MiGA::RemoteDataset::Base
15
15
 
16
16
  @@_EUTILS = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
17
+ @@_NCBI_API_KEY = lambda { |url|
18
+ ENV['NCBI_API_KEY'].nil? ? url : "#{url}&api_key=#{ENV['NCBI_API_KEY']}" }
17
19
 
18
20
  ##
19
21
  # Structure of the different database Universes or containers. The structure
20
22
  # is a Hash with universe names as keys as Symbol and values being a Hash with
21
23
  # supported keys as Symbol:
22
24
  # - +:dbs+ => Hash with keys being the database name and the values a Hash of
23
- # properties such as +stage+, +format+, and +map_to+.
25
+ # properties such as +stage+, +format+, +map_to+, and +getter+.
24
26
  # - +url+ => Pattern of the URL where the data can be obtained, where +%1$s+
25
27
  # is the name of the database, +%2$s+ is the IDs, and +%3$s+ is format.
26
28
  # Additional parameters can be passed to certain functions using the +extra+
@@ -37,21 +39,23 @@ module MiGA::RemoteDataset::Base
37
39
  assembly_gz: {stage: :assembly, format: :fasta_gz},
38
40
  text: {stage: :metadata, format: :text}
39
41
  },
40
- url: "%2$s",
42
+ url: '%2$s',
41
43
  method: :net
42
44
  },
43
45
  ebi: {
44
46
  dbs: { embl: {stage: :assembly, format: :fasta} },
45
- url: "https://www.ebi.ac.uk/Tools/dbfetch/dbfetch/%1$s/%2$s/%3$s",
47
+ url: 'https://www.ebi.ac.uk/Tools/dbfetch/dbfetch/%1$s/%2$s/%3$s',
46
48
  method: :rest
47
49
  },
48
50
  ncbi: {
49
- dbs: { nuccore: {stage: :assembly, format: :fasta} },
51
+ dbs: {
52
+ nuccore: { stage: :assembly, format: :fasta },
53
+ assembly: { stage: :assembly, format: :fasta_gz, getter: :ncbi_asm },
54
+ taxonomy: { stage: :metadata, format: :xml }
55
+ },
50
56
  url: "#{@@_EUTILS}efetch.fcgi?db=%1$s&id=%2$s&rettype=%3$s&retmode=text",
51
57
  method: :rest,
52
- api_key: lambda { |url|
53
- ENV['NCBI_API_KEY'].nil? ?
54
- url : "#{url}&api_key=#{ENV['NCBI_API_KEY']}" }
58
+ api_key: @@_NCBI_API_KEY
55
59
  },
56
60
  ncbi_map: {
57
61
  dbs: {
@@ -62,9 +66,19 @@ module MiGA::RemoteDataset::Base
62
66
  url: "#{@@_EUTILS}elink.fcgi?dbfrom=%1$s&id=%2$s&db=%4$s&retmode=%3$s",
63
67
  method: :net,
64
68
  map_to_universe: :ncbi,
65
- api_key: lambda { |url|
66
- ENV['NCBI_API_KEY'].nil? ?
67
- url : "#{url}&api_key=#{ENV['NCBI_API_KEY']}" }
69
+ api_key: @@_NCBI_API_KEY
70
+ },
71
+ ncbi_summary: {
72
+ dbs: { assembly: { stage: :metadata, format: :json } },
73
+ url: "#{@@_EUTILS}esummary.fcgi?db=%1$s&id=%2$s&retmode=%3$s",
74
+ method: :rest,
75
+ api_key: @@_NCBI_API_KEY
76
+ },
77
+ ncbi_search: {
78
+ dbs: { assembly: { stage: :metadata, format: :json } },
79
+ url: "#{@@_EUTILS}esearch.fcgi?db=%1$s&term=%2$s&retmode=%3$s",
80
+ method: :rest,
81
+ api_key: @@_NCBI_API_KEY
68
82
  }
69
83
  }
70
84
 
@@ -10,15 +10,22 @@ class MiGA::RemoteDataset
10
10
  # Download data from the +universe+ in the database +db+ with IDs +ids+ and
11
11
  # in +format+. If passed, it saves the result in +file+. Additional
12
12
  # parameters specific to the download method can be passed using +extra+.
13
- # Returns String.
14
- def download(universe, db, ids, format, file = nil, extra = [])
13
+ # Returns String. The +obj+ can also be passed as MiGA::RemoteDataset or
14
+ # MiGA::Dataset.
15
+ def download(universe, db, ids, format, file = nil, extra = [], obj = nil)
15
16
  ids = [ids] unless ids.is_a? Array
16
- case @@UNIVERSE[universe][:method]
17
- when :rest
18
- doc = download_rest(universe, db, ids, format, extra)
19
- when :net
20
- doc = download_net(universe, db, ids, format, extra)
21
- end
17
+ getter = @@UNIVERSE[universe][:dbs][db][:getter] || :download
18
+ method = @@UNIVERSE[universe][:method]
19
+ opts = {
20
+ universe: universe,
21
+ db: db,
22
+ ids: ids,
23
+ format: format,
24
+ file: file,
25
+ extra: extra,
26
+ obj: obj
27
+ }
28
+ doc = send("#{getter}_#{method}", opts)
22
29
  unless file.nil?
23
30
  ofh = File.open(file, 'w')
24
31
  ofh.print doc
@@ -28,20 +35,37 @@ class MiGA::RemoteDataset
28
35
  end
29
36
 
30
37
  ##
31
- # Download data using a REST method from the +universe+ in the database +db+
32
- # with IDs +ids+ and in +format+. Additional URL parameters can be passed
33
- # using +extra+. Returns the doc as String.
34
- def download_rest(universe, db, ids, format, extra = [])
35
- u = @@UNIVERSE[universe]
36
- url = sprintf(u[:url], db, ids.join(","), format, *extra)
38
+ # Download data from NCBI Assembly database using the REST method.
39
+ # Supported +opts+ (Hash) include:
40
+ # +obj+ (mandatory): MiGA::RemoteDataset
41
+ # +ids+ (mandatory): String or Array of String
42
+ # +file+: String, passed to download
43
+ # +extra+: Array, passed to download
44
+ # +format+: String, passed to download
45
+ def ncbi_asm_rest(opts)
46
+ url_dir = opts[:obj].ncbi_asm_json_doc['ftppath_genbank']
47
+ url = "#{url_dir}/#{File.basename url_dir}_genomic.fna.gz"
48
+ download(:web, :assembly_gz, url,
49
+ opts[:format], opts[:file], opts[:extra], opts[:obj])
50
+ end
51
+
52
+ ##
53
+ # Download data using the REST method. Supported +opts+ (Hash) include:
54
+ # +universe+ (mandatory): Symbol
55
+ # +db+ (mandatory): Symbol
56
+ # +ids+ (mandatory): Array of String
57
+ # +format+: String
58
+ # +extra+: Array
59
+ def download_rest(opts)
60
+ u = @@UNIVERSE[opts[:universe]]
61
+ url = sprintf(u[:url],
62
+ opts[:db], opts[:ids].join(','), opts[:format], *opts[:extra])
37
63
  url = u[:api_key][url] unless u[:api_key].nil?
38
64
  download_url url
39
65
  end
40
66
 
41
67
  ##
42
- # Download data using a GET request from the +universe+ in the database +db+
43
- # with IDs +ids+ and in +format+. Additional URL parameters can be passed
44
- # using +extra+. Returns the doc as String.
68
+ # Alias of download_rest
45
69
  alias download_net download_rest
46
70
 
47
71
  ##
@@ -51,6 +75,7 @@ class MiGA::RemoteDataset
51
75
  doc = ''
52
76
  @timeout_try = 0
53
77
  begin
78
+ DEBUG 'GET: ' + url
54
79
  open(url, read_timeout: 600) { |f| doc = f.read }
55
80
  rescue => e
56
81
  @timeout_try += 1
@@ -82,6 +107,6 @@ module MiGA::RemoteDataset::Download
82
107
  # Download data into +file+.
83
108
  def download(file)
84
109
  self.class.download(universe, db, ids,
85
- self.class.UNIVERSE[universe][:dbs][db][:format], file)
110
+ self.class.UNIVERSE[universe][:dbs][db][:format], file, [], self)
86
111
  end
87
112
  end
@@ -8,6 +8,16 @@ require 'miga/remote_dataset/download'
8
8
  class MiGA::RemoteDataset < MiGA::MiGA
9
9
  include MiGA::RemoteDataset::Download
10
10
 
11
+ # Class-level
12
+
13
+ class << self
14
+ def ncbi_asm_acc2id(acc)
15
+ return acc if acc =~ /^\d+$/
16
+ search_doc = JSON.parse download(:ncbi_search, :assembly, acc, :json)
17
+ search_doc['esearchresult']['idlist'].first
18
+ end
19
+ end
20
+
11
21
  # Instance-level
12
22
 
13
23
  ##
@@ -19,6 +29,8 @@ class MiGA::RemoteDataset < MiGA::MiGA
19
29
  attr_reader :ids
20
30
  # Internal metadata hash
21
31
  attr_reader :metadata
32
+ # NCBI Assembly XML document
33
+ @_ncbi_asm_xml_doc = nil
22
34
 
23
35
  ##
24
36
  # Initialize MiGA::RemoteDataset with +ids+ in database +db+ from +universe+.
@@ -33,6 +45,7 @@ class MiGA::RemoteDataset < MiGA::MiGA
33
45
  raise "Unknown Universe: #{@universe}. Try: #{@@UNIVERSE.keys}"
34
46
  @@UNIVERSE[@universe][:dbs].include?(@db) or
35
47
  raise "Unknown Database: #{@db}. Try: #{@@UNIVERSE[@universe][:dbs]}"
48
+ @_ncbi_asm_json_doc = nil
36
49
  # FIXME: Part of the +map_to+ support:
37
50
  # unless @@UNIVERSE[@universe][:dbs][@db][:map_to].nil?
38
51
  # MiGA::RemoteDataset.download
@@ -87,7 +100,8 @@ class MiGA::RemoteDataset < MiGA::MiGA
87
100
  ##
88
101
  # Get NCBI Taxonomy ID.
89
102
  def get_ncbi_taxid
90
- send("get_ncbi_taxid_from_#{universe}")
103
+ origin = (universe == :ncbi and db == :assembly) ? :web : universe
104
+ send("get_ncbi_taxid_from_#{origin}")
91
105
  end
92
106
 
93
107
  ##
@@ -107,6 +121,7 @@ class MiGA::RemoteDataset < MiGA::MiGA
107
121
  # Get NCBI taxonomy as MiGA::Taxonomy.
108
122
  def get_ncbi_taxonomy
109
123
  tax_id = get_ncbi_taxid
124
+ return nil if tax_id.nil?
110
125
  lineage = {}
111
126
  doc = MiGA::RemoteDataset.download(:ncbi, :taxonomy, tax_id, :xml)
112
127
  doc.scan(%r{<Taxon>(.*?)</Taxon>}m).map(&:first).each do |i|
@@ -119,15 +134,24 @@ class MiGA::RemoteDataset < MiGA::MiGA
119
134
  MiGA::Taxonomy.new(lineage)
120
135
  end
121
136
 
137
+ ##
138
+ # Get the JSON document describing an NCBI assembly entry.
139
+ def ncbi_asm_json_doc
140
+ return @_ncbi_asm_json_doc unless @_ncbi_asm_json_doc.nil?
141
+ metadata[:ncbi_asm] ||= ids.first if universe == :ncbi and db == :assembly
142
+ return nil unless metadata[:ncbi_asm]
143
+ ncbi_asm_id = self.class.ncbi_asm_acc2id metadata[:ncbi_asm]
144
+ doc = JSON.parse(
145
+ self.class.download(:ncbi_summary, :assembly, ncbi_asm_id, :json))
146
+ @_ncbi_asm_json_doc = doc['result'][ doc['result']['uids'].first ]
147
+ end
148
+
149
+
122
150
  private
123
151
 
124
152
  def get_ncbi_taxid_from_web
125
- return nil unless metadata[:ncbi_asm]
126
- base_url = 'https://www.ncbi.nlm.nih.gov/assembly'
127
- doc = self.class.download_url(
128
- "#{base_url}/#{metadata[:ncbi_asm]}?report=xml&format=text")
129
- taxid = doc.scan(%r{&lt;Taxid&gt;(\S+)&lt;/Taxid&gt;}).first
130
- taxid.nil? ? taxid : taxid.first
153
+ return nil if ncbi_asm_json_doc.nil?
154
+ ncbi_asm_json_doc['taxid']
131
155
  end
132
156
 
133
157
  def get_ncbi_taxid_from_ncbi
@@ -154,29 +178,28 @@ class MiGA::RemoteDataset < MiGA::MiGA
154
178
  biosample = self.class.ncbi_map(metadata[:ncbi_nuccore],
155
179
  :nuccore, :biosample)
156
180
  return metadata if biosample.nil?
157
- asm = self.class.ncbi_map(biosample,
158
- :biosample, :assembly)
181
+ asm = self.class.ncbi_map(biosample, :biosample, :assembly)
159
182
  metadata[:ncbi_asm] = asm.to_s unless asm.nil?
160
183
  get_type_status_ncbi_asm metadata
161
184
  end
162
185
 
163
186
  def get_type_status_ncbi_asm(metadata)
164
- return metadata if metadata[:ncbi_asm].nil?
165
- doc = CGI.unescapeHTML(self.class.download(:web, :text,
166
- "https://www.ncbi.nlm.nih.gov/assembly/" \
167
- "#{metadata[:ncbi_asm]}?report=xml", :xml)).each_line
168
- from_type = doc.grep(%r{<FromType/?>}).first or return metadata
169
- if from_type =~ %r{<FromType/>}
187
+ return metadata if ncbi_asm_json_doc.nil?
188
+ from_type = ncbi_asm_json_doc['from_type']
189
+ from_type = ncbi_asm_json_doc['fromtype'] if from_type.nil?
190
+ case from_type
191
+ when nil
192
+ # Do nothing
193
+ when ''
170
194
  metadata[:is_type] = false
171
195
  metadata[:is_ref_type] = false
172
- elsif from_type =~ %r{<FromType>(.*)</FromType>}
173
- if $1 == 'assembly from reference material'
174
- metadata[:is_type] = false
175
- metadata[:is_ref_type] = true
176
- else
177
- metadata[:is_type] = true
178
- end
179
- metadata[:type_rel] = $1
196
+ when 'assembly from reference material'
197
+ metadata[:is_type] = false
198
+ metadata[:is_ref_type] = true
199
+ metadata[:type_rel] = from_type
200
+ else
201
+ metadata[:is_type] = true
202
+ metadata[:type_rel] = from_type
180
203
  end
181
204
  metadata
182
205
  end
@@ -7,14 +7,14 @@ module MiGA::Result::Dates
7
7
  include MiGA::Result::Base
8
8
 
9
9
  ##
10
- # Returns the start date of processing as DateTime or +nil+ if it doesn't
10
+ # Returns the start date of processing as Time or +nil+ if it doesn't
11
11
  # exist.
12
12
  def started_at
13
13
  date_at :start
14
14
  end
15
15
 
16
16
  ##
17
- # Returns the end (done) date of processing as DateTime or +nil+ if it doesn't
17
+ # Returns the end (done) date of processing as Time or +nil+ if it doesn't
18
18
  # exist.
19
19
  def done_at
20
20
  date_at :done
@@ -38,7 +38,7 @@ module MiGA::Result::Dates
38
38
  f = path event
39
39
  date = File.read(f) if File.size? f
40
40
  end
41
- date.nil? ? nil : DateTime.parse(date)
41
+ date.nil? ? nil : Time.parse(date)
42
42
  end
43
43
  end
44
44
 
data/lib/miga/version.rb CHANGED
@@ -10,7 +10,7 @@ module MiGA
10
10
  # - Float representing the major.minor version.
11
11
  # - Integer representing gem releases of the current version.
12
12
  # - Integer representing minor changes that require new version number.
13
- VERSION = [0.3, 7, 1]
13
+ VERSION = [0.3, 8, 0]
14
14
 
15
15
  ##
16
16
  # Nickname for the current major.minor version.
@@ -18,7 +18,7 @@ module MiGA
18
18
 
19
19
  ##
20
20
  # Date of the current gem release.
21
- VERSION_DATE = Date.today
21
+ VERSION_DATE = Date.new(2019, 02, 28)
22
22
 
23
23
  ##
24
24
  # Reference of MiGA.
data/test/daemon_test.rb CHANGED
@@ -55,7 +55,7 @@ class DaemonTest < Test::Unit::TestCase
55
55
  out = capture_stdout do
56
56
  d.in_loop
57
57
  end
58
- assert_equal(DateTime, d.last_alive.class)
58
+ assert_equal(Time, d.last_alive.class)
59
59
  assert(out.string =~ /-{20}\n.*MiGA:#{p.name} launched/)
60
60
  2.times{ d.in_loop }
61
61
  assert_equal(3, d.loop_i)
@@ -96,7 +96,7 @@ class DaemonTest < Test::Unit::TestCase
96
96
  d = MiGA::Daemon.new(p)
97
97
  assert_nil(d.last_alive)
98
98
  d.declare_alive
99
- assert(d.last_alive - DateTime.now < 1)
99
+ assert(d.last_alive - Time.now < 1)
100
100
  end
101
101
 
102
102
  def test_options
@@ -68,7 +68,7 @@ module MiGA::DistanceRunner::Database
68
68
  if dataset.is_ref? and project.path == ref_project.path
69
69
  y = data_from_db(
70
70
  target.name, dataset.name, ref_db(metric, target.name), metric)
71
- unless y.nil? or y.first.zero?
71
+ unless y.nil? or y.first.nil? or y.first.zero?
72
72
  # Store a copy
73
73
  data_to_db(dataset.name, target.name, tmp_dbs[metric], metric, y)
74
74
  return y.first
data/utils/subclades.R CHANGED
@@ -48,12 +48,18 @@ subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
48
48
  ani.types <- a[,2]
49
49
  names(ani.types) <- a[,1]
50
50
  if(length(ani.d) == 0) load(dist_rdata)
51
- }else{
51
+ }else if(length(labels(ani.d)) > 8L){
52
52
  res <- subclade_clustering(out_base, thr, ani.d, dist_rdata)
53
53
  if(length(res) == 0) return(NULL)
54
54
  ani.medoids <- res[['ani.medoids']]
55
55
  ani.types <- res[['ani.types']]
56
56
  ani.d <- res[['ani.d']]
57
+ }else{
58
+ ani.medoids <- labels(ani.d)[which.min(colSums(as.matrix(ani.d)))]
59
+ ani.types <- rep(1, length(labels(ani.d)))
60
+ names(ani.types) <- labels(ani.d)
61
+ generate_empty_files(out_base)
62
+ write_text_report(out_base, ani.d, ani.medoids, ani.types)
57
63
  }
58
64
 
59
65
  # Recursive search
@@ -136,16 +142,7 @@ subclade_clustering <- function(out_base, thr, ani.d, dist_rdata) {
136
142
  dev.off()
137
143
 
138
144
  # Save results
139
- say("Text report")
140
- write.table(ani.medoids, paste(out_base, "medoids", sep="."),
141
- quote=FALSE, col.names=FALSE, row.names=FALSE)
142
- classif <- cbind(names(ani.types), ani.types, ani.medoids[ ani.types ], NA)
143
- ani.d.m <- 100 - as.matrix(ani.d)*100
144
- for(j in 1:nrow(classif)){
145
- classif[j,4] <- ani.d.m[classif[j,1], classif[j,3]]
146
- }
147
- write.table(classif, paste(out_base,"classif",sep="."),
148
- quote=FALSE, col.names=FALSE, row.names=FALSE, sep="\t")
145
+ write_text_report(out_base, ani.d, ani.medoids, ani.types)
149
146
 
150
147
  # Return data
151
148
  say("Cluster ready")
@@ -168,6 +165,19 @@ generate_empty_files <- function(out_base) {
168
165
  file.create(paste(out_base,".1.medoids",sep=""))
169
166
  }
170
167
 
168
+ write_text_report <- function(out_base, ani.d, ani.medoids, ani.types){
169
+ say("Text report")
170
+ write.table(ani.medoids, paste(out_base, "medoids", sep="."),
171
+ quote=FALSE, col.names=FALSE, row.names=FALSE)
172
+ classif <- cbind(names(ani.types), ani.types, ani.medoids[ ani.types ], NA)
173
+ ani.d.m <- 100 - as.matrix(ani.d)*100
174
+ for(j in 1:nrow(classif)){
175
+ classif[j,4] <- ani.d.m[classif[j,1], classif[j,3]]
176
+ }
177
+ write.table(classif, paste(out_base,"classif",sep="."),
178
+ quote=FALSE, col.names=FALSE, row.names=FALSE, sep="\t")
179
+ }
180
+
171
181
  plot_silhouette <- function(k, s, ns, ds, top.n) {
172
182
  # s
173
183
  par(mar=c(4,5,1,5)+0.1)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: miga-base
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.7.1
4
+ version: 0.3.8.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Luis M. Rodriguez-R
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-02-08 00:00:00.000000000 Z
11
+ date: 2019-02-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: daemons