miga-base 0.3.7.1 → 0.3.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +4 -3
- data/actions/init.rb +3 -3
- data/actions/ncbi_get.rb +82 -87
- data/bin/miga +2 -1
- data/lib/miga/daemon.rb +7 -7
- data/lib/miga/dataset/result.rb +1 -1
- data/lib/miga/remote_dataset/base.rb +24 -10
- data/lib/miga/remote_dataset/download.rb +43 -18
- data/lib/miga/remote_dataset.rb +46 -23
- data/lib/miga/result/dates.rb +3 -3
- data/lib/miga/version.rb +2 -2
- data/test/daemon_test.rb +2 -2
- data/utils/distance/database.rb +1 -1
- data/utils/subclades.R +21 -11
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c7f7bb9ba42ccdbed81ce05484031e17a43c6e688e89c5622327aadbee9d0f31
+  data.tar.gz: 3ca5e3189bb65b213fe43a948710d69ea304dac4c838c176838773fc87a88c5b
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 9fa0deb9770be85a71145fcedd3a8a590ad7a6584f274245ed47df7decde6ddcb8d48c635585773a7005ceca4b03d37e67620346faff3eea387803d113a726f8
+  data.tar.gz: 17eeaad6dc985ff89d813ece94d3e2837fac4a40223cde14063b3236ba5dbd7ff581c6e7df30a4919cb002dd6918a42ac820cf8a3045f99714486f05bbc252fe
data/README.md
CHANGED
@@ -11,9 +11,10 @@
 
 For additional information on MiGA, visit:
 
+* [MiGA Online][miga-online]: The Microbial Genomes Atlas Online.
 * [MiGA users list][mailing-list]:
   Forum to discuss with other users and developers.
-* [MiGA manual][
+* [MiGA manual][manual]: The definitive guide to MiGA.
 * [MiGA API docs][rubydoc]: Inner-workings of the `miga-base` gem.
 * [MiGA Web][miga-web]: MiGA on Rails!
 
@@ -46,8 +47,8 @@ Technology and [RDP][rdp] at Michigan State University.
 See [LICENSE](LICENSE).
 
 [lrr]: http://lmrodriguezr.github.io/
-[mailing-list]:
-[
+[mailing-list]: http://support.microbial-genomes.org/
+[manual]: https://manual.microbial-genomes.org/
 [rubydoc]: http://www.rubydoc.info/github/bio-miga/miga
 [contact]: http://enve-omics.gatech.edu/node/7
 [miga-web]: https://github.com/bio-miga/miga-web
data/actions/init.rb
CHANGED
@@ -229,7 +229,7 @@ unless File.exist?(daemon_f) and ask_user(
   v[:latency] = ask_user(
     'How long should I sleep? (in seconds)', '150').to_i
   v[:maxjobs] = ask_user('How many jobs can I launch at once?', '300').to_i
-  v[:ppn] = ask_user('How many CPUs can I use per job?', '
+  v[:ppn] = ask_user('How many CPUs can I use per job?', '2').to_i
   $stderr.puts 'Setting up internal daemon defaults.'
   $stderr.puts 'If you don\'t understand this just leave default values:'
   v[:cmd] = ask_user(
@@ -245,7 +245,7 @@ unless File.exist?(daemon_f) and ask_user(
   v[:alive] = ask_user(
     "How can I know that a process is still alive?\n %1$s: job id, " +
     "output should be 1 for running and 0 for non-running.\n",
-    "squeue -h -o
+    "squeue -h -o %%t -j '%1$s' | grep '^PD\\|R\\|CF\\|CG$' " +
     "| tail -n 1 | wc -l")
   v[:kill] = ask_user(
     "How should I terminate tasks?\n %s: process ID.", "scancel '%s'")
@@ -254,7 +254,7 @@ unless File.exist?(daemon_f) and ask_user(
   v[:latency] = ask_user(
     'How long should I sleep? (in seconds)', '150').to_i
   v[:maxjobs] = ask_user('How many jobs can I launch at once?', '300').to_i
-  v[:ppn] = ask_user('How many CPUs can I use per job?', '
+  v[:ppn] = ask_user('How many CPUs can I use per job?', '2').to_i
   $stderr.puts 'Setting up internal daemon defaults.'
   $stderr.puts 'If you don\'t understand this just leave default values:'
   v[:cmd] = ask_user(
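For reference, a hedged sketch of how the daemon expands the 'alive' template configured above: Ruby's format operator substitutes '%1$s' with the job ID and turns '%%' into a literal '%'. The job ID below is hypothetical and the grep filter from the default above is omitted for brevity.

# Sketch only: expanding a simplified alive-check template (no grep filter).
alive_tmpl = "squeue -h -o %%t -j '%1$s' | tail -n 1 | wc -l"
puts(alive_tmpl % ['12345']) # hypothetical job ID
# => squeue -h -o %t -j '12345' | tail -n 1 | wc -l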
data/actions/ncbi_get.rb
CHANGED
@@ -4,26 +4,25 @@
 # @license Artistic-2.0
 
 require 'miga/remote_dataset'
+require 'csv'
 
 o = {q:true, query:false, unlink:false,
-
-
-
-
+  reference: false, legacy_name: false,
+  complete: false, chromosome: false,
+  scaffold: false, contig: false, add_version: true, dry: false,
+  get_md: false}
 OptionParser.new do |opt|
   opt_banner(opt)
   opt_object(opt, o, [:project])
   opt.on('-T', '--taxon STRING',
-
-
+    '(Mandatory unless --reference) Taxon name (e.g., a species binomial).'
+    ){ |v| o[:taxon]=v }
   opt.on('--reference',
-
-
-    'If passed, ignores plasmids (only for --reference).'
-    ){ |v| o[:ignore_plasmids]=v }
+    'Download all reference genomes (ignores any other status).'
+    ){ |v| o[:reference]=v }
   opt.on('--complete', 'Download complete genomes.'){ |v| o[:complete]=v }
   opt.on('--chromosome',
-
+    'Download complete chromosomes.'){ |v| o[:chromosome]=v }
   opt.on('--scaffold', 'Download genomes in scaffolds.'){ |v| o[:scaffold]=v }
   opt.on('--contig', 'Download genomes in contigs.'){ |v| o[:contig]=v }
   opt.on('--all', 'Download all genomes (in any status).') do
@@ -33,23 +32,26 @@ OptionParser.new do |opt|
     o[:contig] = true
   end
   opt.on('--no-version-name',
-
-
+    'Do not add sequence version to the dataset name.',
+    'Only affects --complete and --chromosome.'){ |v| o[:add_version]=v }
+  opt.on('--legacy-name',
+    'Use dataset names based on chromosome entries instead of assembly.'
+    ){ |v| o[:legacy_name] = v }
   opt.on('--blacklist PATH',
-
+    'A file with dataset names to blacklist.'){ |v| o[:blacklist] = v }
   opt.on('--dry', 'Do not download or save the datasets.'){ |v| o[:dry] = v }
   opt.on('--get-metadata',
-
-
+    'Only download and update metadata for existing datasets'
+    ){ |v| o[:get_md] = v }
   opt.on('-q', '--query',
-
-
+    'Register the datasets as queries, not reference datasets.'
+    ){ |v| o[:query]=v }
   opt.on('-u', '--unlink',
-
-
+    'Unlink all datasets in the project missing from the download list.'
+    ){ |v| o[:unlink]=v }
   opt.on('-R', '--remote-list PATH',
-
-
+    'Path to an output file with the list of all datasets listed remotely.'
+    ){ |v| o[:remote_list]=v }
   opt.on('--api-key STRING', 'NCBI API key.'){ |v| ENV['NCBI_API_KEY'] = v }
   opt_common(opt, o)
 end.parse!
@@ -68,85 +70,78 @@ d = []
 ds = {}
 downloaded = 0
 
-
-
-
-
-
-
-
-
-
-
-
-end
-
-# Download IDs with reference status
+url_base = 'https://www.ncbi.nlm.nih.gov/genomes/solr2txt.cgi?'
+url_param = {
+  q: '[display()].' +
+    'from(GenomeAssemblies).' +
+    'usingschema(/schema/GenomeAssemblies).' +
+    'matching(tab==["Prokaryotes"] and q=="' + o[:taxon].tr('"',"'") + '"',
+  fields: 'organism|organism,assembly|assembly,replicons|replicons,' +
+    'level|level,ftp_path_genbank|ftp_path_genbank,release_date|release_date,' +
+    'strain|strain',
+  nolimit: 'on',
+}
 if o[:reference]
-
-
-
-
-
-
-
-
-
-  next if ids.empty?
-  n = r[2].miga_name
-  ds[n] = {ids: ids, md: {type: :genome}, db: :nuccore, universe: :ncbi}
-end
+  url_param[:q] += ' and refseq_category==["representative"]'
+else
+  status = {
+    complete: 'Complete',
+    chromosome: ' Chromosome', # <- The leading space is *VERY* important!
+    scaffold: 'Scaffold',
+    contig: 'Contig'
+  }.map { |k, v| '"' + v + '"' if o[k] }.compact.join(',')
+  url_param[:q] += ' and level==[' + status + ']'
 end
+url_param[:q] += ')'
+url = url_base + URI.encode_www_form(url_param)
+$stderr.puts 'Downloading genome list' unless o[:q]
+lineno = 0
+doc = MiGA::RemoteDataset.download_url(url)
+CSV.parse(doc, headers: true).each do |r|
+  asm = r['assembly']
+  next if asm.nil? or asm.empty? or asm == '-'
 
-#
-
-
-    '50|40' : o[:complete] ? '50' : '40')
-  $stderr.puts 'Downloading complete/chromosome genomes' unless o[:q]
-  lineno = 0
-  get_list(o[:taxon], status).each_line do |ln|
-    next if (lineno+=1)==1
-    r = ln.chomp.split("\t")
-    next if r[10].nil? or r[10].empty?
-    ids = r[10].gsub(/[^:;]*:/,'').gsub(/\/[^\/;]*/,'').split(';')
-    ids.delete_if{ |i| i =~ /\A\-*\z/ }
-    next if ids.empty?
-    acc = o[:add_version] ? ids[0] : ids[0].gsub(/\.\d+\Z/,'')
-    n = "#{r[0]}_#{acc}".miga_name
-    ds[n] = {ids: ids, md: {type: :genome}, db: :nuccore, universe: :ncbi}
-  end
-end
+  # Get replicons
+  rep = r['replicons'].nil? ? nil : r['replicons'].
+    split('; ').map{ |i| i.gsub(/.*:/,'') }.map{ |i| i.gsub(/\/.*/, '') }
 
-#
-if o[:
-
-
-
-
-
-
-
-
-  ids = r[19].gsub(/\s/,'').split(';').delete_if{ |i| i =~ /\A\-*\z/ }.
-    map{ |i| "#{i}/#{File.basename(i)}_genomic.fna.gz" }
-  next if ids.empty?
-  n = "#{r[0]}_#{asm}".miga_name
-  asm.gsub!(/\(.*\)/, '')
-  ds[n] = {ids: ids, md: {type: :genome, ncbi_asm: asm},
-    db: :assembly_gz, universe: :web}
+  # Set name
+  if o[:legacy_name] and o[:reference]
+    n = r['#organism'].miga_name
+  else
+    if o[:legacy_name] and ['Complete',' Chromosome'].include? r['level']
+      acc = rep.nil? ? '' : rep.first
+    else
+      acc = asm
+    end
+    acc.gsub!(/\.\d+\Z/, '') unless o[:add_version]
+    n = "#{r['#organism']}_#{acc}".miga_name
   end
+
+  # Register for download
+  fna_url = r['ftp_path_genbank'] + '/' +
+    File.basename(r['ftp_path_genbank']) + '_genomic.fna.gz'
+  ds[n] = {
+    ids: [fna_url], db: :assembly_gz, universe: :web,
+    md: {
+      type: :genome, ncbi_asm: asm, strain: r['strain']
+    }
+  }
+  ds[n][:md][:ncbi_nuccore] = rep.join(',') unless rep.nil?
+  ds[n][:md][:release_date] =
+    Time.parse(r['release_date']).to_s unless r['release_date'].nil?
 end
 
 # Discard blacklisted
 unless o[:blacklist].nil?
   $stderr.puts "Discarding datasets in #{o[:blacklist]}." unless o[:q]
-  File.readlines(o[:blacklist]).
+  File.readlines(o[:blacklist]).
+    select{ |i| i !~ /^#/ }.map(&:chomp).each{ |i| ds.delete i }
 end
 
 # Download entries
 $stderr.puts "Downloading #{ds.size} " +
-  (ds.size
+  (ds.size == 1 ? "entry" : "entries") unless o[:q]
 ds.each do |name,body|
   d << name
   puts name
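To make the new retrieval path above easier to follow, here is a hedged, self-contained Ruby sketch of how the genome-list URL is assembled for a taxon and how the CSV result would be consumed. The taxon value is only an example, and the commented loop mirrors (in simplified form) the action's own parsing.

require 'csv'
require 'uri'

taxon = 'Escherichia coli' # hypothetical example taxon
url_base  = 'https://www.ncbi.nlm.nih.gov/genomes/solr2txt.cgi?'
url_param = {
  q: '[display()].from(GenomeAssemblies).usingschema(/schema/GenomeAssemblies).' +
     'matching(tab==["Prokaryotes"] and q=="' + taxon.tr('"', "'") + '")',
  fields: 'organism|organism,assembly|assembly,replicons|replicons,level|level,' +
          'ftp_path_genbank|ftp_path_genbank,release_date|release_date,strain|strain',
  nolimit: 'on'
}
url = url_base + URI.encode_www_form(url_param)
puts url

# In the action the document is then fetched and parsed, approximately:
# doc = MiGA::RemoteDataset.download_url(url)
# CSV.parse(doc, headers: true).each do |r|
#   next if r['assembly'].to_s.empty? or r['assembly'] == '-'
#   fna_url = r['ftp_path_genbank'] + '/' +
#     File.basename(r['ftp_path_genbank']) + '_genomic.fna.gz'
#   puts "#{r['#organism']}\t#{fna_url}"
# end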
data/bin/miga
CHANGED
@@ -126,7 +126,8 @@ end
 def opt_common(opt, o)
   opt.on("-v", "--verbose",
     "Print additional information to STDERR."){ o[:q]=false }
-  opt.on("-d", "--debug INT",
+  opt.on("-d", "--debug INT",
+    "Print debugging information to STDERR (1: debug, 2: trace).") do |v|
     v.to_i>1 ? MiGA::MiGA.DEBUG_TRACE_ON : MiGA::MiGA.DEBUG_ON
   end
   opt.on("-h", "--help", "Display this screen.") do
data/lib/miga/daemon.rb
CHANGED
@@ -12,11 +12,11 @@ class MiGA::Daemon < MiGA::MiGA
 
   ##
   # When was the last time a daemon for the MiGA::Project +project+ was seen
-  # active? Returns
+  # active? Returns Time.
   def self.last_alive(project)
     f = File.expand_path('daemon/alive', project.path)
     return nil unless File.exist? f
-
+    Time.parse(File.read(f))
   end
 
   # Array of all spawned daemons.
@@ -49,7 +49,7 @@ class MiGA::Daemon < MiGA::MiGA
 
   ##
   # When was the last time a daemon for the current project was seen active?
-  # Returns
+  # Returns Time.
   def last_alive
     MiGA::Daemon.last_alive project
   end
@@ -229,6 +229,10 @@ class MiGA::Daemon < MiGA::MiGA
     @loop_i += 1
     check_datasets
     check_project
+    if shutdown_when_done? and jobs_running.size + jobs_to_run.size == 0
+      say 'Nothing else to do, shutting down.'
+      return false
+    end
     flush!
     if loop_i==4
       say 'Housekeeping for sanity'
@@ -237,10 +241,6 @@ class MiGA::Daemon < MiGA::MiGA
     end
     report_status
     sleep(latency)
-    if shutdown_when_done? and jobs_running.size+jobs_to_run.size == 0
-      say 'Nothing else to do, shutting down.'
-      return false
-    end
     true
   end
 
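The hunks above move the shutdown-when-done test from after sleep(latency) to just after check_project, so an idle daemon exits before flushing, housekeeping, and sleeping; as in the diff, returning false from in_loop signals shutdown. A toy, self-contained sketch of the reordered loop (method names taken from the diff; the stubs are illustrative, not MiGA's implementation):

class LoopSketch
  def initialize(shutdown_when_done, running, to_run)
    @shutdown_when_done = shutdown_when_done
    @jobs_running = Array.new(running)
    @jobs_to_run = Array.new(to_run)
  end

  def shutdown_when_done?; @shutdown_when_done; end
  def jobs_running; @jobs_running; end
  def jobs_to_run; @jobs_to_run; end

  # Returns false to stop the daemon, true to keep looping.
  def in_loop
    # ... check_datasets and check_project would run here ...
    if shutdown_when_done? and jobs_running.size + jobs_to_run.size == 0
      puts 'Nothing else to do, shutting down.'
      return false # exit before flushing, housekeeping, and sleeping
    end
    # ... flush!, housekeeping, report_status, sleep(latency) ...
    true
  end
end

puts LoopSketch.new(true, 0, 0).in_loop # => false: idle daemon stops at once
puts LoopSketch.new(true, 2, 1).in_loop # => true: work pending, keep looping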
data/lib/miga/dataset/result.rb
CHANGED
@@ -215,7 +215,7 @@ module MiGA::Dataset::Result
     r.clean! if opts[:is_clean]
     unless r.clean?
       MiGA::MiGA.clean_fasta_file(r.file_path :proteins)
-      MiGA::MiGA.clean_fasta_file(r.file_path :genes)
+      MiGA::MiGA.clean_fasta_file(r.file_path :genes) if r.file_path :genes
       r.clean!
     end
     r
data/lib/miga/remote_dataset/base.rb
CHANGED
@@ -14,13 +14,15 @@ end
 module MiGA::RemoteDataset::Base
 
   @@_EUTILS = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
+  @@_NCBI_API_KEY = lambda { |url|
+    ENV['NCBI_API_KEY'].nil? ? url : "#{url}&api_key=#{ENV['NCBI_API_KEY']}" }
 
   ##
   # Structure of the different database Universes or containers. The structure
   # is a Hash with universe names as keys as Symbol and values being a Hash with
   # supported keys as Symbol:
   # - +:dbs+ => Hash with keys being the database name and the values a Hash of
-  #   properties such as +stage+, +format+, and +
+  #   properties such as +stage+, +format+, +map_to+, and +getter+.
   # - +url+ => Pattern of the URL where the data can be obtained, where +%1$s+
   #   is the name of the database, +%2$s+ is the IDs, and +%3$s+ is format.
   #   Additional parameters can be passed to certain functions using the +extra+
@@ -37,21 +39,23 @@ module MiGA::RemoteDataset::Base
       assembly_gz: {stage: :assembly, format: :fasta_gz},
       text: {stage: :metadata, format: :text}
     },
-    url:
+    url: '%2$s',
     method: :net
   },
   ebi: {
     dbs: { embl: {stage: :assembly, format: :fasta} },
-    url:
+    url: 'https://www.ebi.ac.uk/Tools/dbfetch/dbfetch/%1$s/%2$s/%3$s',
     method: :rest
   },
   ncbi: {
-    dbs: {
+    dbs: {
+      nuccore: { stage: :assembly, format: :fasta },
+      assembly: { stage: :assembly, format: :fasta_gz, getter: :ncbi_asm },
+      taxonomy: { stage: :metadata, format: :xml }
+    },
     url: "#{@@_EUTILS}efetch.fcgi?db=%1$s&id=%2$s&rettype=%3$s&retmode=text",
     method: :rest,
-    api_key:
-      ENV['NCBI_API_KEY'].nil? ?
-        url : "#{url}&api_key=#{ENV['NCBI_API_KEY']}" }
+    api_key: @@_NCBI_API_KEY
   },
   ncbi_map: {
     dbs: {
@@ -62,9 +66,19 @@ module MiGA::RemoteDataset::Base
     url: "#{@@_EUTILS}elink.fcgi?dbfrom=%1$s&id=%2$s&db=%4$s&retmode=%3$s",
     method: :net,
     map_to_universe: :ncbi,
-    api_key:
-
-
+    api_key: @@_NCBI_API_KEY
+  },
+  ncbi_summary: {
+    dbs: { assembly: { stage: :metadata, format: :json } },
+    url: "#{@@_EUTILS}esummary.fcgi?db=%1$s&id=%2$s&retmode=%3$s",
+    method: :rest,
+    api_key: @@_NCBI_API_KEY
+  },
+  ncbi_search: {
+    dbs: { assembly: { stage: :metadata, format: :json } },
+    url: "#{@@_EUTILS}esearch.fcgi?db=%1$s&term=%2$s&retmode=%3$s",
+    method: :rest,
+    api_key: @@_NCBI_API_KEY
   }
 }
 
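A small standalone sketch of the @@_NCBI_API_KEY lambda introduced above: it takes an already-built E-utilities URL and appends api_key only when the NCBI_API_KEY environment variable is set. The URL and key below are illustrative.

ncbi_api_key = lambda { |url|
  ENV['NCBI_API_KEY'].nil? ? url : "#{url}&api_key=#{ENV['NCBI_API_KEY']}"
}

url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi' +
      '?db=nuccore&id=NC_000913.3&rettype=fasta&retmode=text'
puts ncbi_api_key[url]         # unchanged when no key is set
ENV['NCBI_API_KEY'] = 'my-key' # hypothetical key
puts ncbi_api_key[url]         # ...&api_key=my-key appended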
data/lib/miga/remote_dataset/download.rb
CHANGED
@@ -10,15 +10,22 @@ class MiGA::RemoteDataset
   # Download data from the +universe+ in the database +db+ with IDs +ids+ and
   # in +format+. If passed, it saves the result in +file+. Additional
   # parameters specific to the download method can be passed using +extra+.
-  # Returns String.
-
+  # Returns String. The +obj+ can also be passed as MiGA::RemoteDataset or
+  # MiGA::Dataset.
+  def download(universe, db, ids, format, file = nil, extra = [], obj = nil)
     ids = [ids] unless ids.is_a? Array
-
-
-
-
-
-
+    getter = @@UNIVERSE[universe][:dbs][db][:getter] || :download
+    method = @@UNIVERSE[universe][:method]
+    opts = {
+      universe: universe,
+      db: db,
+      ids: ids,
+      format: format,
+      file: file,
+      extra: extra,
+      obj: obj
+    }
+    doc = send("#{getter}_#{method}", opts)
     unless file.nil?
       ofh = File.open(file, 'w')
       ofh.print doc
@@ -28,20 +35,37 @@ class MiGA::RemoteDataset
   end
 
   ##
-  # Download data
-  #
-  #
-
-
-
+  # Download data from NCBI Assembly database using the REST method.
+  # Supported +opts+ (Hash) include:
+  # +obj+ (mandatory): MiGA::RemoteDataset
+  # +ids+ (mandatory): String or Array of String
+  # +file+: String, passed to download
+  # +extra+: Array, passed to download
+  # +format+: String, passed to download
+  def ncbi_asm_rest(opts)
+    url_dir = opts[:obj].ncbi_asm_json_doc['ftppath_genbank']
+    url = "#{url_dir}/#{File.basename url_dir}_genomic.fna.gz"
+    download(:web, :assembly_gz, url,
+      opts[:format], opts[:file], opts[:extra], opts[:obj])
+  end
+
+  ##
+  # Download data using the REST method. Supported +opts+ (Hash) include:
+  # +universe+ (mandatory): Symbol
+  # +db+ (mandatory): Symbol
+  # +ids+ (mandatory): Array of String
+  # +format+: String
+  # +extra+: Array
+  def download_rest(opts)
+    u = @@UNIVERSE[opts[:universe]]
+    url = sprintf(u[:url],
+      opts[:db], opts[:ids].join(','), opts[:format], *opts[:extra])
     url = u[:api_key][url] unless u[:api_key].nil?
     download_url url
   end
 
   ##
-  #
-  # with IDs +ids+ and in +format+. Additional URL parameters can be passed
-  # using +extra+. Returns the doc as String.
+  # Alias of download_rest
   alias download_net download_rest
 
   ##
@@ -51,6 +75,7 @@ class MiGA::RemoteDataset
     doc = ''
     @timeout_try = 0
     begin
+      DEBUG 'GET: ' + url
      open(url, read_timeout: 600) { |f| doc = f.read }
    rescue => e
      @timeout_try += 1
@@ -82,6 +107,6 @@ module MiGA::RemoteDataset::Download
   # Download data into +file+.
   def download(file)
     self.class.download(universe, db, ids,
-      self.class.UNIVERSE[universe][:dbs][db][:format], file)
+      self.class.UNIVERSE[universe][:dbs][db][:format], file, [], self)
   end
 end
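To illustrate the new dispatch in download(): the handler name is built from the database's optional :getter (falling back to :download) plus the universe's :method, so :ncbi/:assembly routes to ncbi_asm_rest while most databases still go through download_rest. A minimal, self-contained sketch with stubbed handlers and a trimmed universe table (not MiGA's full @@UNIVERSE):

UNIVERSE_SKETCH = {
  ncbi: { method: :rest, dbs: { assembly: { getter: :ncbi_asm } } },
  ebi:  { method: :rest, dbs: { embl: {} } }
}

def download_rest(opts); "download_rest for #{opts[:db]}"; end
def ncbi_asm_rest(opts); "ncbi_asm_rest for #{opts[:ids].join(',')}"; end

def dispatch(universe, db, ids)
  getter = UNIVERSE_SKETCH[universe][:dbs][db][:getter] || :download
  method = UNIVERSE_SKETCH[universe][:method]
  send("#{getter}_#{method}", { db: db, ids: ids })
end

puts dispatch(:ebi, :embl, ['U00096'])             # => download_rest for embl
puts dispatch(:ncbi, :assembly, ['GCA_000005845']) # => ncbi_asm_rest for GCA_000005845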
data/lib/miga/remote_dataset.rb
CHANGED
@@ -8,6 +8,16 @@ require 'miga/remote_dataset/download'
 class MiGA::RemoteDataset < MiGA::MiGA
   include MiGA::RemoteDataset::Download
 
+  # Class-level
+
+  class << self
+    def ncbi_asm_acc2id(acc)
+      return acc if acc =~ /^\d+$/
+      search_doc = JSON.parse download(:ncbi_search, :assembly, acc, :json)
+      search_doc['esearchresult']['idlist'].first
+    end
+  end
+
   # Instance-level
 
   ##
@@ -19,6 +29,8 @@ class MiGA::RemoteDataset < MiGA::MiGA
   attr_reader :ids
   # Internal metadata hash
   attr_reader :metadata
+  # NCBI Assembly XML document
+  @_ncbi_asm_xml_doc = nil
 
   ##
   # Initialize MiGA::RemoteDataset with +ids+ in database +db+ from +universe+.
@@ -33,6 +45,7 @@ class MiGA::RemoteDataset < MiGA::MiGA
       raise "Unknown Universe: #{@universe}. Try: #{@@UNIVERSE.keys}"
     @@UNIVERSE[@universe][:dbs].include?(@db) or
       raise "Unknown Database: #{@db}. Try: #{@@UNIVERSE[@universe][:dbs]}"
+    @_ncbi_asm_json_doc = nil
     # FIXME: Part of the +map_to+ support:
     # unless @@UNIVERSE[@universe][:dbs][@db][:map_to].nil?
     #   MiGA::RemoteDataset.download
@@ -87,7 +100,8 @@ class MiGA::RemoteDataset < MiGA::MiGA
   ##
   # Get NCBI Taxonomy ID.
   def get_ncbi_taxid
-
+    origin = (universe == :ncbi and db == :assembly) ? :web : universe
+    send("get_ncbi_taxid_from_#{origin}")
   end
 
   ##
@@ -107,6 +121,7 @@ class MiGA::RemoteDataset < MiGA::MiGA
   # Get NCBI taxonomy as MiGA::Taxonomy.
   def get_ncbi_taxonomy
     tax_id = get_ncbi_taxid
+    return nil if tax_id.nil?
     lineage = {}
     doc = MiGA::RemoteDataset.download(:ncbi, :taxonomy, tax_id, :xml)
     doc.scan(%r{<Taxon>(.*?)</Taxon>}m).map(&:first).each do |i|
@@ -119,15 +134,24 @@ class MiGA::RemoteDataset < MiGA::MiGA
     MiGA::Taxonomy.new(lineage)
   end
 
+  ##
+  # Get the JSON document describing an NCBI assembly entry.
+  def ncbi_asm_json_doc
+    return @_ncbi_asm_json_doc unless @_ncbi_asm_json_doc.nil?
+    metadata[:ncbi_asm] ||= ids.first if universe == :ncbi and db == :assembly
+    return nil unless metadata[:ncbi_asm]
+    ncbi_asm_id = self.class.ncbi_asm_acc2id metadata[:ncbi_asm]
+    doc = JSON.parse(
+      self.class.download(:ncbi_summary, :assembly, ncbi_asm_id, :json))
+    @_ncbi_asm_json_doc = doc['result'][ doc['result']['uids'].first ]
+  end
+
+
   private
 
   def get_ncbi_taxid_from_web
-    return nil
-
-    doc = self.class.download_url(
-      "#{base_url}/#{metadata[:ncbi_asm]}?report=xml&format=text")
-    taxid = doc.scan(%r{<Taxid>(\S+)</Taxid>}).first
-    taxid.nil? ? taxid : taxid.first
+    return nil if ncbi_asm_json_doc.nil?
+    ncbi_asm_json_doc['taxid']
   end
 
   def get_ncbi_taxid_from_ncbi
@@ -154,29 +178,28 @@ class MiGA::RemoteDataset < MiGA::MiGA
     biosample = self.class.ncbi_map(metadata[:ncbi_nuccore],
       :nuccore, :biosample)
     return metadata if biosample.nil?
-    asm = self.class.ncbi_map(biosample,
-      :biosample, :assembly)
+    asm = self.class.ncbi_map(biosample, :biosample, :assembly)
     metadata[:ncbi_asm] = asm.to_s unless asm.nil?
     get_type_status_ncbi_asm metadata
   end
 
   def get_type_status_ncbi_asm(metadata)
-    return metadata if
-
-
-
-
-
+    return metadata if ncbi_asm_json_doc.nil?
+    from_type = ncbi_asm_json_doc['from_type']
+    from_type = ncbi_asm_json_doc['fromtype'] if from_type.nil?
+    case from_type
+    when nil
+      # Do nothing
+    when ''
       metadata[:is_type] = false
      metadata[:is_ref_type] = false
-
-
-
-
-
-
-
-      metadata[:type_rel] = $1
+    when 'assembly from reference material'
+      metadata[:is_type] = false
+      metadata[:is_ref_type] = true
+      metadata[:type_rel] = from_type
+    else
+      metadata[:is_type] = true
+      metadata[:type_rel] = from_type
     end
     metadata
   end
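Putting the new pieces together, a hedged sketch of the accession-to-JSON flow added above: esearch maps an assembly accession to a numeric UID and esummary returns the JSON document whose ftppath_genbank and taxid fields the new code reads. This mirrors ncbi_asm_acc2id and ncbi_asm_json_doc using the class method shown in the diff; the accession is only an example and the calls reach NCBI when run.

require 'json'
require 'miga/remote_dataset'

acc = 'GCA_000005845.2' # hypothetical assembly accession
search = JSON.parse(
  MiGA::RemoteDataset.download(:ncbi_search, :assembly, acc, :json))
uid = search['esearchresult']['idlist'].first

summary = JSON.parse(
  MiGA::RemoteDataset.download(:ncbi_summary, :assembly, uid, :json))
doc = summary['result'][summary['result']['uids'].first]
puts doc['ftppath_genbank'] # used by ncbi_asm_rest to build the .fna.gz URL
puts doc['taxid']           # used by get_ncbi_taxid_from_web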
data/lib/miga/result/dates.rb
CHANGED
@@ -7,14 +7,14 @@ module MiGA::Result::Dates
   include MiGA::Result::Base
 
   ##
-  # Returns the start date of processing as
+  # Returns the start date of processing as Time or +nil+ if it doesn't
   # exist.
   def started_at
     date_at :start
   end
 
   ##
-  # Returns the end (done) date of processing as
+  # Returns the end (done) date of processing as Time or +nil+ if it doesn't
   # exist.
   def done_at
     date_at :done
@@ -38,7 +38,7 @@ module MiGA::Result::Dates
       f = path event
       date = File.read(f) if File.size? f
     end
-    date.nil? ? nil :
+    date.nil? ? nil : Time.parse(date)
   end
 end
 
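Since date_at now returns a parsed Time rather than the raw file contents, start and done stamps can be compared directly; a tiny sketch with hypothetical timestamps:

require 'time'
started = Time.parse('2019-02-28 09:15:02 -0500') # hypothetical timestamps
done    = Time.parse('2019-02-28 09:18:47 -0500')
puts done - started # => 225.0 (elapsed seconds as Float)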
data/lib/miga/version.rb
CHANGED
@@ -10,7 +10,7 @@ module MiGA
   # - Float representing the major.minor version.
   # - Integer representing gem releases of the current version.
   # - Integer representing minor changes that require new version number.
-  VERSION = [0.3,
+  VERSION = [0.3, 8, 0]
 
   ##
   # Nickname for the current major.minor version.
@@ -18,7 +18,7 @@ module MiGA
 
   ##
   # Date of the current gem release.
-  VERSION_DATE = Date.
+  VERSION_DATE = Date.new(2019, 02, 28)
 
   ##
   # Reference of MiGA.
data/test/daemon_test.rb
CHANGED
@@ -55,7 +55,7 @@ class DaemonTest < Test::Unit::TestCase
     out = capture_stdout do
       d.in_loop
     end
-    assert_equal(
+    assert_equal(Time, d.last_alive.class)
     assert(out.string =~ /-{20}\n.*MiGA:#{p.name} launched/)
     2.times{ d.in_loop }
     assert_equal(3, d.loop_i)
@@ -96,7 +96,7 @@ class DaemonTest < Test::Unit::TestCase
     d = MiGA::Daemon.new(p)
     assert_nil(d.last_alive)
     d.declare_alive
-    assert(d.last_alive -
+    assert(d.last_alive - Time.now < 1)
   end
 
   def test_options
data/utils/distance/database.rb
CHANGED
@@ -68,7 +68,7 @@ module MiGA::DistanceRunner::Database
     if dataset.is_ref? and project.path == ref_project.path
       y = data_from_db(
         target.name, dataset.name, ref_db(metric, target.name), metric)
-      unless y.nil? or y.first.zero?
+      unless y.nil? or y.first.nil? or y.first.zero?
         # Store a copy
         data_to_db(dataset.name, target.name, tmp_dbs[metric], metric, y)
         return y.first
data/utils/subclades.R
CHANGED
@@ -48,12 +48,18 @@ subclades <- function(ani_file, out_base, thr = 1, ani.d = dist(0), sel = NA) {
     ani.types <- a[,2]
     names(ani.types) <- a[,1]
     if(length(ani.d) == 0) load(dist_rdata)
-  }else{
+  }else if(length(labels(ani.d)) > 8L){
     res <- subclade_clustering(out_base, thr, ani.d, dist_rdata)
     if(length(res) == 0) return(NULL)
     ani.medoids <- res[['ani.medoids']]
     ani.types <- res[['ani.types']]
     ani.d <- res[['ani.d']]
+  }else{
+    ani.medoids <- labels(ani.d)[which.min(colSums(as.matrix(ani.d)))]
+    ani.types <- rep(1, length(labels(ani.d)))
+    names(ani.types) <- labels(ani.d)
+    generate_empty_files(out_base)
+    write_text_report(out_base, ani.d, ani.medoids, ani.types)
   }
 
   # Recursive search
@@ -136,16 +142,7 @@ subclade_clustering <- function(out_base, thr, ani.d, dist_rdata) {
   dev.off()
 
   # Save results
-
-  write.table(ani.medoids, paste(out_base, "medoids", sep="."),
-    quote=FALSE, col.names=FALSE, row.names=FALSE)
-  classif <- cbind(names(ani.types), ani.types, ani.medoids[ ani.types ], NA)
-  ani.d.m <- 100 - as.matrix(ani.d)*100
-  for(j in 1:nrow(classif)){
-    classif[j,4] <- ani.d.m[classif[j,1], classif[j,3]]
-  }
-  write.table(classif, paste(out_base,"classif",sep="."),
-    quote=FALSE, col.names=FALSE, row.names=FALSE, sep="\t")
+  write_text_report(out_base, ani.d, ani.medoids, ani.types)
 
   # Return data
   say("Cluster ready")
@@ -168,6 +165,19 @@ generate_empty_files <- function(out_base) {
   file.create(paste(out_base,".1.medoids",sep=""))
 }
 
+write_text_report <- function(out_base, ani.d, ani.medoids, ani.types){
+  say("Text report")
+  write.table(ani.medoids, paste(out_base, "medoids", sep="."),
+    quote=FALSE, col.names=FALSE, row.names=FALSE)
+  classif <- cbind(names(ani.types), ani.types, ani.medoids[ ani.types ], NA)
+  ani.d.m <- 100 - as.matrix(ani.d)*100
+  for(j in 1:nrow(classif)){
+    classif[j,4] <- ani.d.m[classif[j,1], classif[j,3]]
+  }
+  write.table(classif, paste(out_base,"classif",sep="."),
+    quote=FALSE, col.names=FALSE, row.names=FALSE, sep="\t")
+}
+
 
 plot_silhouette <- function(k, s, ns, ds, top.n) {
   # s
   par(mar=c(4,5,1,5)+0.1)
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: miga-base
 version: !ruby/object:Gem::Version
-  version: 0.3.
+  version: 0.3.8.0
 platform: ruby
 authors:
 - Luis M. Rodriguez-R
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2019-02-
+date: 2019-02-28 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: daemons