miga-base 1.3.8.2 → 1.3.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (50) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +2 -2
  3. data/lib/miga/cli/action/add_result.rb +22 -1
  4. data/lib/miga/cli/action/browse/about.html +4 -2
  5. data/lib/miga/cli/action/download/gtdb.rb +1 -1
  6. data/lib/miga/cli/action/download/ncbi.rb +43 -68
  7. data/lib/miga/cli/action/download/seqcode.rb +1 -2
  8. data/lib/miga/cli/action/ncbi_get.rb +1 -8
  9. data/lib/miga/cli/action/wf.rb +15 -6
  10. data/lib/miga/cli/objects_helper.rb +3 -0
  11. data/lib/miga/cli/opt_helper.rb +8 -2
  12. data/lib/miga/common/net.rb +100 -18
  13. data/lib/miga/dataset/base.rb +40 -12
  14. data/lib/miga/dataset/hooks.rb +8 -0
  15. data/lib/miga/dataset/result/ignore.rb +14 -2
  16. data/lib/miga/dataset/type.rb +51 -0
  17. data/lib/miga/dataset.rb +3 -22
  18. data/lib/miga/json.rb +9 -0
  19. data/lib/miga/project/base.rb +15 -9
  20. data/lib/miga/project.rb +7 -1
  21. data/lib/miga/remote_dataset/base.rb +117 -36
  22. data/lib/miga/remote_dataset/download.rb +121 -54
  23. data/lib/miga/remote_dataset.rb +34 -13
  24. data/lib/miga/result/stats.rb +2 -0
  25. data/lib/miga/result/versions.rb +23 -0
  26. data/lib/miga/result.rb +7 -1
  27. data/lib/miga/taxonomy/base.rb +3 -2
  28. data/lib/miga/version.rb +2 -2
  29. data/scripts/assembly.bash +15 -1
  30. data/scripts/cds.bash +9 -3
  31. data/scripts/distances.bash +103 -5
  32. data/scripts/essential_genes.bash +14 -1
  33. data/scripts/mytaxa.bash +18 -3
  34. data/scripts/mytaxa_scan.bash +16 -3
  35. data/scripts/read_quality.bash +6 -2
  36. data/scripts/ssu.bash +19 -1
  37. data/scripts/stats.bash +9 -3
  38. data/scripts/taxonomy.bash +98 -2
  39. data/scripts/trimmed_fasta.bash +10 -2
  40. data/scripts/trimmed_reads.bash +26 -6
  41. data/test/dataset_test.rb +17 -2
  42. data/test/hook_test.rb +3 -2
  43. data/test/net_test.rb +21 -5
  44. data/test/project_test.rb +13 -0
  45. data/test/remote_dataset_test.rb +106 -7
  46. data/test/result_test.rb +47 -21
  47. data/test/taxonomy_test.rb +9 -3
  48. data/utils/distance/runner.rb +3 -1
  49. data/utils/distances.rb +1 -1
  50. metadata +4 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2e4abdd418e396b20fcfea4beaf4d70f3b0022808714478581e2e1f6f8a85478
4
- data.tar.gz: 881884c4d5a933b64ac29e93397b62b7d1dc30fd599baaa8b6ac04a3ad3ab051
3
+ metadata.gz: 7a4aa208ac4dfe7ff6edbedb7aa7b3444c909c77e47b2ef1b93282adc83192d6
4
+ data.tar.gz: 78bdce8752b3a1a1a281123a99cccdc002d0f95a2a345aa16b1c4ee684220c29
5
5
  SHA512:
6
- metadata.gz: 317022ffe39818af13d36e3bf1e1adea80b3ab3aafb265b3df6371b6b5e87efdb75199cf8bd22aea37a63e941f824d1686f85bdb7a38eda3f4ba2a76c535b86b
7
- data.tar.gz: 0d42dadd982374f0c618d76fc030f5665075e9973ca8e8615c5b090ab1dcc6bf663901a8f7355ce3bfa6874941d9098957d31704b6369474e516dd5ebf8c0b6b
6
+ metadata.gz: 335e7a8715a6c561b618e21d1e273381f88d384daf16360c01f9b6818ec374f05f6ae824a36628fbe4dcb0c7d6e082f633e8372137edfa035389cb92922363fa
7
+ data.tar.gz: 76bdf75348e711bdfcca6b973acd3768559acde7ee80233833ae8b8caa595020599756b7913774dc53c038290592d20710b1b67bd5fa8bb93fd334206bf21b06
data/README.md CHANGED
@@ -12,7 +12,7 @@
12
12
  For additional information on MiGA, visit:
13
13
 
14
14
  * [MiGA Online][miga-online]: The Microbial Genomes Atlas Online
15
- * [MiGA@XSEDE][miga-at-xsede]: The MiGA@XSEDE Gateway
15
+ * [MiGA Gateway][miga-gateway]: The MiGA Science Gateway
16
16
  * [MiGA users list][mailing-list]:
17
17
  Forum to discuss with other users and developers
18
18
  * [MiGA manual][manual]: The definitive guide to MiGA
@@ -57,6 +57,6 @@ See [LICENSE](LICENSE).
57
57
  [miga-web]: https://github.com/bio-miga/miga-web
58
58
  [miga-gui]: https://github.com/bio-miga/miga-gui
59
59
  [miga-online]: http://microbial-genomes.org/
60
- [miga-at-xsede]: https://xsede.microbial-genomes.org/
60
+ [miga-gateway]: https://gateway.microbial-genomes.org/
61
61
  [kostas]: http://enve-omics.gatech.edu/
62
62
  [rdp]: http://rdp.cme.msu.edu/
@@ -5,13 +5,17 @@ require 'miga/cli/action'
5
5
 
6
6
  class MiGA::Cli::Action::AddResult < MiGA::Cli::Action
7
7
  def parse_cli
8
- cli.defaults = { force: false }
8
+ cli.defaults = { force: false, stdin_versions: false }
9
9
  cli.parse do |opt|
10
10
  cli.opt_object(opt, [:project, :dataset_opt, :result])
11
11
  opt.on(
12
12
  '-f', '--force',
13
13
  'Force re-indexing of the result even if it\'s already registered'
14
14
  ) { |v| cli[:force] = v }
15
+ opt.on(
16
+ '--stdin-versions',
17
+ 'Read Software versions from STDIN'
18
+ ) { |v| cli[:stdin_versions] = v }
15
19
  end
16
20
  end
17
21
 
@@ -21,5 +25,22 @@ class MiGA::Cli::Action::AddResult < MiGA::Cli::Action
21
25
  cli.say "Registering result: #{cli[:result]}"
22
26
  r = obj.add_result(cli[:result], true, force: cli[:force])
23
27
  raise 'Cannot add result, incomplete expected files' if r.nil?
28
+
29
+ # Add Software version data
30
+ if cli[:stdin_versions]
31
+ versions = {}
32
+ sw = nil
33
+ $stdin.each do |ln|
34
+ ln = ln.chomp.strip
35
+ if ln =~ /^=> (.*)/
36
+ sw = $1
37
+ versions[sw] = ''
38
+ else
39
+ versions[sw] += ln
40
+ end
41
+ end
42
+ r.add_versions(versions)
43
+ r.save
44
+ end
24
45
  end
25
46
  end
@@ -12,10 +12,12 @@ terms of the terms of the
12
12
  <p>
13
13
  MiGA is the result of a collaboration between the
14
14
  <a href='http://enve-omics.gatech.edu/'>Kostas Lab</a>
15
- (<a href='http://www.gatech.edu/'>Georgia Institute of Technology</a>) and the
15
+ (<a href='http://www.gatech.edu/'>Georgia Institute of Technology</a>), the
16
16
  <a href='http://rdp.cme.msu.edu/'>RDP team</a>
17
17
  (<a href='http://cme.msu.edu/'>Center for Microbial Ecology</a>,
18
- <a href='https://msu.edu/'>Michigan State University</a>).
18
+ <a href='https://msu.edu/'>Michigan State University</a>), and the
19
+ <a href="https://disc-genomics.uibk.ac.at/">Rodriguez-R lab</a>
20
+ (<a href="https://uibk.ac.at/">University of Innsbruck</a>).
19
21
  The MiGA project is funded by the
20
22
  <a href='http://nsf.gov/'>US National Science Foundation</a>
21
23
  (Awards <a href='http://nsf.gov/awardsearch/showAward?AWD_ID=1356288'>#1356288</a> &amp;
@@ -31,7 +31,7 @@ module MiGA::Cli::Action::Download::Gtdb
31
31
 
32
32
  def remote_list
33
33
  cli.say 'Downloading genome list'
34
- extra = ['sp_reps_only=' + cli[:reference].to_s]
34
+ extra = { sp_reps_only: cli[:reference].to_s }
35
35
  json = MiGA::RemoteDataset.download(
36
36
  :gtdb, :taxon, cli[:taxon], :genomes, nil, extra
37
37
  )
@@ -34,11 +34,8 @@ module MiGA::Cli::Action::Download::Ncbi
34
34
  'Do not add sequence version to the dataset name',
35
35
  'Only affects --complete and --chromosome'
36
36
  ) { |v| cli[:add_version] = v }
37
- cli.opt_flag(
38
- opt, 'legacy-name',
39
- 'Use dataset names based on chromosome entries instead of assembly',
40
- :legacy_name
41
- )
37
+ # For backwards compatibility
38
+ cli.opt_flag(opt, 'legacy-name', '::HIDE::', :legacy_name)
42
39
  end
43
40
 
44
41
  def sanitize_cli
@@ -52,89 +49,67 @@ module MiGA::Cli::Action::Download::Ncbi
52
49
  end
53
50
 
54
51
  def remote_list
55
- doc =
56
- if cli[:ncbi_table_file]
57
- cli.say 'Reading genome list from file'
58
- File.open(cli[:ncbi_table_file], 'r')
59
- else
60
- cli.say 'Downloading genome list'
61
- url = remote_list_url
62
- MiGA::RemoteDataset.download_url(url)
63
- end
64
- ds = parse_csv_as_datasets(doc)
65
- doc.close if cli[:ncbi_table_file]
66
- ds
52
+ list = {}
53
+ query = remote_list_query
54
+ loop do
55
+ # Query the remote collection
56
+ page = MiGA::Json.parse(
57
+ MiGA::RemoteDataset.download(:ncbi_datasets, :genome, query, :json),
58
+ contents: true
59
+ )
60
+ break unless page&.any? && page[:reports]&.any?
61
+
62
+ # Process reports in this page
63
+ list.merge!(parse_reports_as_datasets(page[:reports]))
64
+
65
+ # Next page
66
+ break unless page[:next_page_token]
67
+ query[:page_token] = page[:next_page_token]
68
+ end
69
+ list
67
70
  end
68
71
 
69
- def parse_csv_as_datasets(doc)
72
+ def parse_reports_as_datasets(reports)
70
73
  ds = {}
71
- CSV.parse(doc, headers: true).each do |r|
72
- asm = r['assembly']
74
+ reports.each do |r|
75
+ asm = r[:accession]
73
76
  next if asm.nil? || asm.empty? || asm == '-'
74
77
 
75
- rep = remote_row_replicons(r)
76
- n = remote_row_name(r, rep, asm)
77
-
78
78
  # Register for download
79
+ n = remote_report_name(r, asm)
79
80
  ds[n] = {
80
81
  ids: [asm], db: :assembly, universe: :ncbi,
81
82
  md: {
82
- type: :genome, ncbi_asm: asm, strain: r['strain']
83
+ type: :genome, ncbi_asm: asm, strain: r.dig(:organism, :infraspecific_names, :strain)
83
84
  }
84
85
  }
85
- ds[n][:md][:ncbi_nuccore] = rep.join(',') unless rep.nil?
86
- unless r['release_date'].nil?
87
- ds[n][:md][:release_date] = Time.parse(r['release_date']).to_s
88
- end
86
+ date = r.dig(:assembly_info, :release_date)
87
+ ds[n][:md][:release_date] = Time.parse(date).to_s if date
88
+ ds[n][:md][:ncbi_dataset] = r
89
89
  end
90
90
  ds
91
91
  end
92
92
 
93
- def remote_row_replicons(r)
94
- return if r['replicons'].nil?
95
-
96
- r['replicons']
97
- .split('; ')
98
- .map { |i| i.gsub(/.*:/, '') }
99
- .map { |i| i.gsub(%r{/.*}, '') }
100
- end
101
-
102
- def remote_row_name(r, rep, asm)
103
- return r['#organism'].miga_name if cli[:legacy_name] && cli[:reference]
104
-
105
- if cli[:legacy_name] && ['Complete', ' Chromosome'].include?(r['level'])
106
- acc = rep.nil? ? '' : rep.first
107
- else
108
- acc = asm
109
- end
93
+ def remote_report_name(r, asm)
94
+ acc = "#{asm}"
110
95
  acc.gsub!(/\.\d+\Z/, '') unless cli[:add_version]
111
- "#{r['#organism']}_#{acc}".miga_name
96
+ org = r.dig(:organism, :organism_name)
97
+ acc = "#{org}_#{acc}" if org
98
+ acc.miga_name
112
99
  end
113
100
 
114
- def remote_list_url
115
- url_base = 'https://www.ncbi.nlm.nih.gov/genomes/solr2txt.cgi?'
116
- url_param = {
117
- q: '[display()].' \
118
- 'from(GenomeAssemblies).' \
119
- 'usingschema(/schema/GenomeAssemblies).' \
120
- 'matching(tab==["Prokaryotes"] and q=="' \
121
- "#{cli[:taxon]&.tr('"', "'")}\"",
122
- fields: 'organism|organism,assembly|assembly,replicons|replicons,' \
123
- 'level|level,release_date|release_date,strain|strain',
124
- nolimit: 'on'
125
- }
101
+ def remote_list_query
102
+ q = { taxons: [cli[:taxon]], filters: {} }
126
103
  if cli[:reference]
127
- url_param[:q] += ' and refseq_category==["representative"]'
104
+ q[:filters][:reference_only] = true
128
105
  else
129
- status = {
130
- complete: 'Complete',
131
- chromosome: ' Chromosome', # <- The leading space is *VERY* important!
132
- scaffold: 'Scaffold',
133
- contig: 'Contig'
134
- }.map { |k, v| '"' + v + '"' if cli[k] }.compact.join(',')
135
- url_param[:q] += ' and level==[' + status + ']'
106
+ q[:assembly_level] = {
107
+ contig: 'contig',
108
+ scaffold: 'scaffold',
109
+ chromosome: 'chromosome',
110
+ complete: 'complete_genome'
111
+ }.map { |k, v| '"' + v + '"' if cli[k] }.compact
136
112
  end
137
- url_param[:q] += ')'
138
- url_base + URI.encode_www_form(url_param)
113
+ q
139
114
  end
140
115
  end
@@ -29,8 +29,7 @@ module MiGA::Cli::Action::Download::Seqcode
29
29
 
30
30
  while current_page <= total_pages
31
31
  json = MiGA::RemoteDataset.download(
32
- :seqcode, :'type-genomes', nil, :json, nil,
33
- ["page=#{current_page}"]
32
+ :seqcode, :'type-genomes', nil, :json, nil, page: current_page
34
33
  )
35
34
  doc = MiGA::Json.parse(json, contents: true)
36
35
  current_page = doc[:current_page] + 1
@@ -8,8 +8,7 @@ class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action
8
8
 
9
9
  def parse_cli
10
10
  cli.defaults = {
11
- query: false, unlink: false,
12
- reference: false, legacy_name: false,
11
+ query: false, unlink: false, reference: false,
13
12
  complete: false, chromosome: false,
14
13
  scaffold: false, contig: false, add_version: true, dry: false,
15
14
  get_md: false, only_md: false, save_every: 1
@@ -29,12 +28,6 @@ class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action
29
28
  '--api-key STRING',
30
29
  '::HIDE::' # For backwards compatibility
31
30
  ) { |v| ENV['NCBI_API_KEY'] = v }
32
- opt.on(
33
- '--ncbi-table-file STRING',
34
- '::HIDE::' # Only meant for debugging
35
- # It can take the table returned by NCBI and parse it from a file
36
- # instead of downloading it directly
37
- ) { |v| cli[:ncbi_table_file] = v }
38
31
  opt.on(
39
32
  '--ncbi-api-key STRING',
40
33
  'NCBI API key'
@@ -8,7 +8,8 @@ module MiGA::Cli::Action::Wf
8
8
  cli.expect_files = true
9
9
  cli.defaults = {
10
10
  clean: false, project_type: :genomes, dataset_type: :popgenome,
11
- ncbi_draft: true, min_qual: MiGA::Project.OPTIONS[:min_qual][:default],
11
+ ncbi_draft: true, ncbi_ref: false,
12
+ min_qual: MiGA::Project.OPTIONS[:min_qual][:default],
12
13
  prepare_and_exit: false
13
14
  }
14
15
  end
@@ -39,14 +40,21 @@ module MiGA::Cli::Action::Wf
39
40
  '-T', '--ncbi-taxon STRING',
40
41
  'Download all the genomes in NCBI classified as this taxon'
41
42
  ) { |v| cli[:ncbi_taxon] = v }
43
+ opt.on(
44
+ '--no-draft', '::HIDE::' # Deprecated
45
+ ) { |v| cli[:ncbi_draft] = v }
46
+ opt.on(
47
+ '--ncbi-complete',
48
+ 'Only download complete genomes, not drafts (requires -T)'
49
+ ) { |v| cli[:ncbi_draft] = !v }
50
+ opt.on(
51
+ '--ncbi-ref',
52
+ 'Only download RefSeq reference genomes (requires -T)'
53
+ ) { |v| cli[:ncbi_ref] = v }
42
54
  opt.on(
43
55
  '-G', '--gtdb-taxon STRING',
44
56
  'Download all the genomes in GTDB classified as this taxon'
45
57
  ) { |v| cli[:gtdb_taxon] = v }
46
- opt.on(
47
- '--no-draft',
48
- 'Only download complete genomes, not drafts (requires -T)'
49
- ) { |v| cli[:ncbi_draft] = v }
50
58
  opt.on(
51
59
  '--gtdb-ref',
52
60
  'Only download reference anchor genomes in GTDB (requires -G)'
@@ -170,7 +178,8 @@ module MiGA::Cli::Action::Wf
170
178
  def download_datasets
171
179
  # Download datasets from NCBI
172
180
  unless cli[:ncbi_taxon].nil?
173
- what = cli[:ncbi_draft] ? '--all' : '--complete'
181
+ what = cli[:ncbi_ref] ? '--reference' :
182
+ cli[:ncbi_draft] ? '--all' : '--complete'
174
183
  cmd = ['ncbi_get', '-P', cli[:outdir], '-T', cli[:ncbi_taxon], what]
175
184
  cmd += ['--max', cli[:max_download]] if cli[:max_download]
176
185
  call_cli(cmd)
@@ -60,6 +60,9 @@ module MiGA::Cli::ObjectsHelper
60
60
  o &&= (d.ref? == self[:ref]) unless self[:ref].nil?
61
61
  o &&= (d.active? == self[:active]) unless self[:active].nil?
62
62
  o &&= (self[:multi] ? d.multi? : d.nonmulti?) unless self[:multi].nil?
63
+ unless self[:markers].nil?
64
+ o &&= (self[:markers] ? d.markers? : !d.markers?)
65
+ end
63
66
  unless self[:taxonomy].nil?
64
67
  o &&= !d.metadata[:tax].nil? && d.metadata[:tax].in?(self[:taxonomy])
65
68
  end
@@ -43,7 +43,7 @@ module MiGA::Cli::OptHelper
43
43
  '-h', '--help',
44
44
  'Display this screen'
45
45
  ) do
46
- puts opt.to_s.gsub(/^.*\s+::HIDE::\s*$/, '')
46
+ puts opt.to_a.select { |i| i !~ /\s::HIDE::\s/ }
47
47
  exit
48
48
  end
49
49
  opt.separator ''
@@ -120,10 +120,11 @@ module MiGA::Cli::OptHelper
120
120
  # as determined by +what+ an Array with any combination of:
121
121
  # - :ref To filter by reference (--ref) or query (--no-ref)
122
122
  # - :multi To filter by multiple (--multi) or single (--no-multi) species
123
+ # - :markers To filter by datasets with (--markers) or without (--no-markers) markers
123
124
  # - :active To filter by active (--active) or inactive (--no-active)
124
125
  # - :taxonomy To filter by taxonomy (--taxonomy)
125
126
  # The "k-th" filter (--dataset-k) is always included
126
- def opt_filter_datasets(opt, what = %i[ref multi active taxonomy])
127
+ def opt_filter_datasets(opt, what = %i[ref multi markers active taxonomy])
127
128
  what.each do |w|
128
129
  case w
129
130
  when :ref
@@ -136,6 +137,11 @@ module MiGA::Cli::OptHelper
136
137
  '--[no-]multi',
137
138
  'Use only multi-species (or only single-species) datasets'
138
139
  ) { |v| self[:multi] = v }
140
+ when :markers
141
+ opt.on(
142
+ '--[no-]markers',
143
+ 'Use only datasets with (or without) markers'
144
+ ) { |v| self[:markers] = v }
139
145
  when :active
140
146
  opt.on(
141
147
  '--[no-]active',
@@ -1,5 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'net/http'
3
4
  require 'net/ftp'
4
5
  require 'open-uri'
5
6
  require 'fileutils'
@@ -10,6 +11,8 @@ Net::FTP.const_set('FTP_PORT', 21)
10
11
  ##
11
12
  # General web-access functions shared throughout MiGA.
12
13
  module MiGA::Common::Net
14
+ attr_accessor :remote_connection_uri
15
+
13
16
  ##
14
17
  # Returns the URL of the host +name+ (Symbol)
15
18
  def known_hosts(name)
@@ -21,7 +24,7 @@ module MiGA::Common::Net
21
24
  when :miga_dist
22
25
  "ftp://#{main_server}/dist"
23
26
  else
24
- raise "Unrecognized server name: #{host}"
27
+ raise "Unrecognized server name: #{name}"
25
28
  end
26
29
  end
27
30
 
@@ -32,49 +35,128 @@ module MiGA::Common::Net
32
35
  end
33
36
 
34
37
  ##
35
- # Connect to an FTP +host+ (String) or a known host name (Symbol, see
36
- # +.known_hosts+)
38
+ # Connect to an FTP +host+ (String), a known host name (Symbol, see
39
+ # +.known_hosts+), or a parsed +URI+ object
40
+ #
41
+ # Sets the attribute +remote_connection_uri+ to the parsed +URI+ object
42
+ # silently
37
43
  def remote_connection(host)
38
44
  host = known_hosts(host) if host.is_a?(Symbol)
39
- uri = URI.parse(host)
40
- raise 'Only FTP hosts are currently supported' unless uri.scheme == 'ftp'
41
-
42
- ftp = Net::FTP.new(uri.host)
43
- ftp.passive = true
44
- ftp.login
45
- ftp.chdir(uri.path)
46
- ftp
45
+ uri = host.is_a?(URI) ? host : URI.parse(host)
46
+ @remote_connection_uri = uri
47
+
48
+ case uri.scheme
49
+ when 'ftp'
50
+ ftp = Net::FTP.new(uri.host)
51
+ ftp.passive = true
52
+ ftp.login
53
+ ftp.chdir(uri.path) unless host.is_a?(URI)
54
+ ftp
55
+ when 'http', 'https'
56
+ http = Net::HTTP.new(uri.host, uri.port)
57
+ http.read_timeout = 600
58
+ http.use_ssl = uri.scheme == 'https'
59
+ http
60
+ else
61
+ raise 'Only FTP, HTTP, and HTTPS are currently supported'
62
+ end
47
63
  end
48
64
 
49
65
  ##
50
66
  # Download a file via FTP using the +connection+ (returned by
51
- # +.remote_connection+) with remote name +file+ into local +target+.
67
+ # +.remote_connection+) with remote name +file+ into local +target+. If +file+
68
+ # is +nil+, it tries to guess the file from +connection+. If +target+ is
69
+ # +nil+, it returns the read data instead
52
70
  #
53
- # Alternatively, +connection+ can simply be the host (String) or a recognized
54
- # Symbol (see +.remote_connection+), in which case the function opens the
55
- # connection automatically
71
+ # Alternatively, +connection+ can simply be the host (String), a recognized
72
+ # Symbol (see +.remote_connection+), or a parsed +URI+ object, in which case
73
+ # the function opens the connection automatically
56
74
  #
57
75
  # Reports progress to the function block with two arguments: the
58
76
  # currently transferred size and the total file size
59
- def download_file_ftp(connection, file, target)
77
+ def download_file_ftp(connection, file = nil, target = nil)
60
78
  # Open connection unless passed
61
79
  close_conn = false
62
- if connection.is_a?(String) || connection.is_a?(Symbol)
80
+ if connection.is_a?(String) || connection.is_a?(Symbol) ||
81
+ connection.is_a?(URI)
63
82
  connection = remote_connection(connection)
83
+ file ||= remote_connection_uri.path
64
84
  close_conn = true
65
85
  end
66
86
 
67
87
  # Prepare download
68
- FileUtils.mkdir_p(File.dirname(target))
88
+ FileUtils.mkdir_p(File.dirname(target)) if target
69
89
  filesize = connection.size(file)
70
90
  transferred = 0
71
91
 
72
92
  # Get in chunks of 1KiB
93
+ ret = ''
73
94
  connection.getbinaryfile(file, target, 1024) do |data|
74
95
  yield(transferred += data.size, filesize) if block_given?
96
+ ret += data unless target
75
97
  end
76
98
 
77
99
  # Close connection if automatically opened
78
100
  connection.close if close_conn
101
+ ret unless target
102
+ end
103
+
104
+ ##
105
+ # Submit an HTTP or HTTPS request using +url+, which should be a URL
106
+ # either as String or parsed URI. The request follows the +method+, which
107
+ # should be a Net::HTTP verb such as +:get+, +:post+, or +:patch+. All
108
+ # additional parameters for the corresponding method should be passed as
109
+ # +opts+.
110
+ def http_request(method, url, *opts)
111
+ doc = nil
112
+ remote_connection(url).start do |http|
113
+ res = http.send(method, remote_connection_uri.to_s, *opts)
114
+ if %w[301 302].include?(res.code)
115
+ DEBUG "REDIRECTION #{res.code}: #{res['location']}"
116
+ return http_request(method, res['location'], *opts)
117
+ end
118
+ res.value # To force exception unless success
119
+ doc = res.body
120
+ end
121
+ doc
122
+ end
123
+
124
+ def net_method(method, uri, *opts)
125
+ attempts ||= 0
126
+ DEBUG "#{method.to_s.upcase}: #{uri} #{opts}"
127
+ case method.to_sym
128
+ when :ftp
129
+ download_file_ftp(uri)
130
+ else
131
+ http_request(method, uri, *opts)
132
+ end
133
+ rescue => e
134
+ raise e if (attempts += 1) >= 3
135
+
136
+ sleep 5 # <- For: 429 Too Many Requests
137
+ DEBUG "RETRYING after: #{e}"
138
+ retry
139
+ end
140
+
141
+ alias :https_request :http_request
142
+
143
+ ##
144
+ # Normalize the encoding of +body+ to UTF-8 by attempting several
145
+ # common recodings. Code from https://github.com/seq-code/registry
146
+ def normalize_encoding(body)
147
+ # Test encodings
148
+ body.force_encoding('utf-8')
149
+ %w[iso8859-1 windows-1252 us-ascii ascii-8bit].each do |enc|
150
+ break if body.valid_encoding?
151
+ recode = body.force_encoding(enc).encode('utf-8')
152
+ body = recode if recode.valid_encoding?
153
+ end
154
+ # If nothing works, replace offending characters with '?'
155
+ unless body.valid_encoding?
156
+ body = body.encode(
157
+ 'utf-8', invalid: :replace, undef: :replace, replace: '?'
158
+ )
159
+ end
160
+ body
79
161
  end
80
162
  end
@@ -32,6 +32,12 @@ class MiGA::Dataset < MiGA::MiGA
32
32
  @@EXCLUDE_NOREF_TASKS
33
33
  end
34
34
 
35
+ ##
36
+ # Tasks to be excluded from datasets without markers
37
+ def EXCLUDE_NOMARKER_TASKS
38
+ @@EXCLUDE_NOMARKER_TASKS
39
+ end
40
+
35
41
  ##
36
42
  # Tasks to be executed only in datasets that are single-organism. These
37
43
  # tasks are ignored for multi-organism datasets or for unknown types
@@ -81,45 +87,67 @@ module MiGA::Dataset::Base
81
87
  # Supported dataset types
82
88
  @@KNOWN_TYPES = {
83
89
  genome: {
84
- description: 'The genome from an isolate', multi: false
90
+ description: 'The genome from an isolate',
91
+ multi: false, markers: true,
92
+ project_types: %i[mixed genomes clade]
85
93
  },
86
94
  scgenome: {
87
- description: 'A Single-cell Amplified Genome (SAG)', multi: false
95
+ description: 'A Single-cell Amplified Genome (SAG)',
96
+ multi: false, markers: true,
97
+ project_types: %i[mixed genomes clade]
88
98
  },
89
99
  popgenome: {
90
- description: 'A Metagenome-Assembled Genome (MAG)', multi: false
100
+ description: 'A Metagenome-Assembled Genome (MAG)',
101
+ multi: false, markers: true,
102
+ project_types: %i[mixed genomes clade]
91
103
  },
92
104
  metagenome: {
93
- description: 'A metagenome (excluding viromes)', multi: true
105
+ description: 'A metagenome (excluding viromes)',
106
+ multi: true, markers: true,
107
+ project_types: %i[mixed metagenomes]
94
108
  },
95
109
  virome: {
96
- description: 'A viral metagenome', multi: true
110
+ description: 'A viral metagenome',
111
+ multi: true,
112
+ markers: true, # <- We don't expect, but can be useful for contamination
113
+ project_types: %i[mixed metagenomes]
114
+ },
115
+ plasmid: {
116
+ description: 'An individual plasmid',
117
+ multi: false, markers: false,
118
+ project_types: %i[mixed plasmids]
97
119
  }
98
120
  }
99
121
 
100
122
  ##
101
123
  # Returns an Array of tasks (Symbols) to be executed before project-wide tasks
102
- @@PREPROCESSING_TASKS = [
103
- :raw_reads, :trimmed_reads, :read_quality, :trimmed_fasta,
104
- :assembly, :cds, :essential_genes, :mytaxa, :mytaxa_scan,
105
- :taxonomy, :distances, :ssu, :stats
124
+ @@PREPROCESSING_TASKS = %i[
125
+ raw_reads trimmed_reads read_quality trimmed_fasta
126
+ assembly cds essential_genes mytaxa mytaxa_scan
127
+ taxonomy distances ssu stats
106
128
  ]
107
129
 
108
130
  ##
109
131
  # Tasks to be excluded from query datasets
110
- @@EXCLUDE_NOREF_TASKS = [:mytaxa_scan, :taxonomy]
132
+ @@EXCLUDE_NOREF_TASKS = %i[mytaxa_scan taxonomy]
111
133
  @@_EXCLUDE_NOREF_TASKS_H = Hash[@@EXCLUDE_NOREF_TASKS.map { |i| [i, true] }]
112
134
 
135
+ ##
136
+ # Tasks to be excluded from datasets without markers
137
+ @@EXCLUDE_NOMARKER_TASKS = %i[essential_genes ssu]
138
+ @@_EXCLUDE_NOMARKER_TASKS_H =
139
+ Hash[@@EXCLUDE_NOMARKER_TASKS.map { |i| [i, true] }]
140
+
113
141
  ##
114
142
  # Tasks to be executed only in datasets that are single-organism. These
115
143
  # tasks are ignored for multi-organism datasets or for unknown types
116
- @@ONLY_NONMULTI_TASKS = [:mytaxa_scan, :taxonomy, :distances]
144
+ @@ONLY_NONMULTI_TASKS = %i[mytaxa_scan taxonomy distances]
117
145
  @@_ONLY_NONMULTI_TASKS_H = Hash[@@ONLY_NONMULTI_TASKS.map { |i| [i, true] }]
118
146
 
119
147
  ##
120
148
  # Tasks to be executed only in datasets that are multi-organism. These
121
149
  # tasks are ignored for single-organism datasets or for unknown types
122
- @@ONLY_MULTI_TASKS = [:mytaxa]
150
+ @@ONLY_MULTI_TASKS = %i[mytaxa]
123
151
  @@_ONLY_MULTI_TASKS_H = Hash[@@ONLY_MULTI_TASKS.map { |i| [i, true] }]
124
152
 
125
153
  ##
@@ -15,6 +15,7 @@ require 'miga/common/hooks'
15
15
  # Supported hooks:
16
16
  # - run_lambda(lambda, args...)
17
17
  # - recalculate_status()
18
+ # - check_type()
18
19
  # - clear_run_counts()
19
20
  # - run_cmd(cmd)
20
21
  # Internal hooks:
@@ -27,6 +28,7 @@ module MiGA::Dataset::Hooks
27
28
  def default_hooks
28
29
  {
29
30
  on_create: [[:recalculate_status]],
31
+ on_save: [[:check_type]],
30
32
  on_activate: [[:clear_run_counts], [:recalculate_status]],
31
33
  on_inactivate: [[:recalculate_status]],
32
34
  on_result_ready: [[:_pull_result_hooks]],
@@ -51,6 +53,12 @@ module MiGA::Dataset::Hooks
51
53
  recalculate_status
52
54
  end
53
55
 
56
+ ##
57
+ # Ensure that the dataset type exists and is compatible with the project type
58
+ def hook_check_type(_hook_args, _event_args)
59
+ check_type
60
+ end
61
+
54
62
  ##
55
63
  # Run +cmd+ in the command-line with {{variables}}:
56
64
  # dataset, project, project_name, miga, object (if defined for the event)