miga-base 1.3.8.2 → 1.3.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +2 -2
  3. data/lib/miga/cli/action/add_result.rb +22 -1
  4. data/lib/miga/cli/action/browse/about.html +4 -2
  5. data/lib/miga/cli/action/download/gtdb.rb +1 -1
  6. data/lib/miga/cli/action/download/ncbi.rb +43 -68
  7. data/lib/miga/cli/action/download/seqcode.rb +1 -2
  8. data/lib/miga/cli/action/ncbi_get.rb +1 -8
  9. data/lib/miga/cli/action/wf.rb +15 -6
  10. data/lib/miga/cli/objects_helper.rb +3 -0
  11. data/lib/miga/cli/opt_helper.rb +8 -2
  12. data/lib/miga/common/net.rb +100 -18
  13. data/lib/miga/dataset/base.rb +40 -12
  14. data/lib/miga/dataset/hooks.rb +8 -0
  15. data/lib/miga/dataset/result/ignore.rb +14 -2
  16. data/lib/miga/dataset/type.rb +51 -0
  17. data/lib/miga/dataset.rb +3 -22
  18. data/lib/miga/json.rb +9 -0
  19. data/lib/miga/project/base.rb +15 -9
  20. data/lib/miga/project.rb +7 -1
  21. data/lib/miga/remote_dataset/base.rb +117 -36
  22. data/lib/miga/remote_dataset/download.rb +121 -54
  23. data/lib/miga/remote_dataset.rb +34 -13
  24. data/lib/miga/result/stats.rb +2 -0
  25. data/lib/miga/result/versions.rb +23 -0
  26. data/lib/miga/result.rb +7 -1
  27. data/lib/miga/taxonomy/base.rb +3 -2
  28. data/lib/miga/version.rb +2 -2
  29. data/scripts/assembly.bash +15 -1
  30. data/scripts/cds.bash +9 -3
  31. data/scripts/distances.bash +103 -5
  32. data/scripts/essential_genes.bash +14 -1
  33. data/scripts/mytaxa.bash +18 -3
  34. data/scripts/mytaxa_scan.bash +16 -3
  35. data/scripts/read_quality.bash +6 -2
  36. data/scripts/ssu.bash +19 -1
  37. data/scripts/stats.bash +9 -3
  38. data/scripts/taxonomy.bash +98 -2
  39. data/scripts/trimmed_fasta.bash +10 -2
  40. data/scripts/trimmed_reads.bash +26 -6
  41. data/test/dataset_test.rb +17 -2
  42. data/test/hook_test.rb +3 -2
  43. data/test/net_test.rb +21 -5
  44. data/test/project_test.rb +13 -0
  45. data/test/remote_dataset_test.rb +106 -7
  46. data/test/result_test.rb +47 -21
  47. data/test/taxonomy_test.rb +9 -3
  48. data/utils/distance/runner.rb +3 -1
  49. data/utils/distances.rb +1 -1
  50. metadata +4 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2e4abdd418e396b20fcfea4beaf4d70f3b0022808714478581e2e1f6f8a85478
4
- data.tar.gz: 881884c4d5a933b64ac29e93397b62b7d1dc30fd599baaa8b6ac04a3ad3ab051
3
+ metadata.gz: 7a4aa208ac4dfe7ff6edbedb7aa7b3444c909c77e47b2ef1b93282adc83192d6
4
+ data.tar.gz: 78bdce8752b3a1a1a281123a99cccdc002d0f95a2a345aa16b1c4ee684220c29
5
5
  SHA512:
6
- metadata.gz: 317022ffe39818af13d36e3bf1e1adea80b3ab3aafb265b3df6371b6b5e87efdb75199cf8bd22aea37a63e941f824d1686f85bdb7a38eda3f4ba2a76c535b86b
7
- data.tar.gz: 0d42dadd982374f0c618d76fc030f5665075e9973ca8e8615c5b090ab1dcc6bf663901a8f7355ce3bfa6874941d9098957d31704b6369474e516dd5ebf8c0b6b
6
+ metadata.gz: 335e7a8715a6c561b618e21d1e273381f88d384daf16360c01f9b6818ec374f05f6ae824a36628fbe4dcb0c7d6e082f633e8372137edfa035389cb92922363fa
7
+ data.tar.gz: 76bdf75348e711bdfcca6b973acd3768559acde7ee80233833ae8b8caa595020599756b7913774dc53c038290592d20710b1b67bd5fa8bb93fd334206bf21b06
data/README.md CHANGED
@@ -12,7 +12,7 @@
12
12
  For additional information on MiGA, visit:
13
13
 
14
14
  * [MiGA Online][miga-online]: The Microbial Genomes Atlas Online
15
- * [MiGA@XSEDE][miga-at-xsede]: The MiGA@XSEDE Gateway
15
+ * [MiGA Gateway][miga-gateway]: The MiGA Science Gateway
16
16
  * [MiGA users list][mailing-list]:
17
17
  Forum to discuss with other users and developers
18
18
  * [MiGA manual][manual]: The definitive guide to MiGA
@@ -57,6 +57,6 @@ See [LICENSE](LICENSE).
57
57
  [miga-web]: https://github.com/bio-miga/miga-web
58
58
  [miga-gui]: https://github.com/bio-miga/miga-gui
59
59
  [miga-online]: http://microbial-genomes.org/
60
- [miga-at-xsede]: https://xsede.microbial-genomes.org/
60
+ [miga-gateway]: https://gateway.microbial-genomes.org/
61
61
  [kostas]: http://enve-omics.gatech.edu/
62
62
  [rdp]: http://rdp.cme.msu.edu/
@@ -5,13 +5,17 @@ require 'miga/cli/action'
5
5
 
6
6
  class MiGA::Cli::Action::AddResult < MiGA::Cli::Action
7
7
  def parse_cli
8
- cli.defaults = { force: false }
8
+ cli.defaults = { force: false, stdin_versions: false }
9
9
  cli.parse do |opt|
10
10
  cli.opt_object(opt, [:project, :dataset_opt, :result])
11
11
  opt.on(
12
12
  '-f', '--force',
13
13
  'Force re-indexing of the result even if it\'s already registered'
14
14
  ) { |v| cli[:force] = v }
15
+ opt.on(
16
+ '--stdin-versions',
17
+ 'Read Software versions from STDIN'
18
+ ) { |v| cli[:stdin_versions] = v }
15
19
  end
16
20
  end
17
21
 
@@ -21,5 +25,22 @@ class MiGA::Cli::Action::AddResult < MiGA::Cli::Action
21
25
  cli.say "Registering result: #{cli[:result]}"
22
26
  r = obj.add_result(cli[:result], true, force: cli[:force])
23
27
  raise 'Cannot add result, incomplete expected files' if r.nil?
28
+
29
+ # Add Software version data
30
+ if cli[:stdin_versions]
31
+ versions = {}
32
+ sw = nil
33
+ $stdin.each do |ln|
34
+ ln = ln.chomp.strip
35
+ if ln =~ /^=> (.*)/
36
+ sw = $1
37
+ versions[sw] = ''
38
+ else
39
+ versions[sw] += ln
40
+ end
41
+ end
42
+ r.add_versions(versions)
43
+ r.save
44
+ end
24
45
  end
25
46
  end
@@ -12,10 +12,12 @@ terms of the terms of the
12
12
  <p>
13
13
  MiGA is the result of a collaboration between the
14
14
  <a href='http://enve-omics.gatech.edu/'>Kostas Lab</a>
15
- (<a href='http://www.gatech.edu/'>Georgia Institute of Technology</a>) and the
15
+ (<a href='http://www.gatech.edu/'>Georgia Institute of Technology</a>), the
16
16
  <a href='http://rdp.cme.msu.edu/'>RDP team</a>
17
17
  (<a href='http://cme.msu.edu/'>Center for Microbial Ecology</a>,
18
- <a href='https://msu.edu/'>Michigan State University</a>).
18
+ <a href='https://msu.edu/'>Michigan State University</a>), and the
19
+ <a href="https://disc-genomics.uibk.ac.at/">Rodriguez-R lab</a>
20
+ (<a href="https://uibk.ac.at/">University of Innsbruck</a>).
19
21
  The MiGA project is funded by the
20
22
  <a href='http://nsf.gov/'>US National Science Foundation</a>
21
23
  (Awards <a href='http://nsf.gov/awardsearch/showAward?AWD_ID=1356288'>#1356288</a> &amp;
@@ -31,7 +31,7 @@ module MiGA::Cli::Action::Download::Gtdb
31
31
 
32
32
  def remote_list
33
33
  cli.say 'Downloading genome list'
34
- extra = ['sp_reps_only=' + cli[:reference].to_s]
34
+ extra = { sp_reps_only: cli[:reference].to_s }
35
35
  json = MiGA::RemoteDataset.download(
36
36
  :gtdb, :taxon, cli[:taxon], :genomes, nil, extra
37
37
  )
@@ -34,11 +34,8 @@ module MiGA::Cli::Action::Download::Ncbi
34
34
  'Do not add sequence version to the dataset name',
35
35
  'Only affects --complete and --chromosome'
36
36
  ) { |v| cli[:add_version] = v }
37
- cli.opt_flag(
38
- opt, 'legacy-name',
39
- 'Use dataset names based on chromosome entries instead of assembly',
40
- :legacy_name
41
- )
37
+ # For backwards compatibility
38
+ cli.opt_flag(opt, 'legacy-name', '::HIDE::', :legacy_name)
42
39
  end
43
40
 
44
41
  def sanitize_cli
@@ -52,89 +49,67 @@ module MiGA::Cli::Action::Download::Ncbi
52
49
  end
53
50
 
54
51
  def remote_list
55
- doc =
56
- if cli[:ncbi_table_file]
57
- cli.say 'Reading genome list from file'
58
- File.open(cli[:ncbi_table_file], 'r')
59
- else
60
- cli.say 'Downloading genome list'
61
- url = remote_list_url
62
- MiGA::RemoteDataset.download_url(url)
63
- end
64
- ds = parse_csv_as_datasets(doc)
65
- doc.close if cli[:ncbi_table_file]
66
- ds
52
+ list = {}
53
+ query = remote_list_query
54
+ loop do
55
+ # Query the remote collection
56
+ page = MiGA::Json.parse(
57
+ MiGA::RemoteDataset.download(:ncbi_datasets, :genome, query, :json),
58
+ contents: true
59
+ )
60
+ break unless page&.any? && page[:reports]&.any?
61
+
62
+ # Process reports in this page
63
+ list.merge!(parse_reports_as_datasets(page[:reports]))
64
+
65
+ # Next page
66
+ break unless page[:next_page_token]
67
+ query[:page_token] = page[:next_page_token]
68
+ end
69
+ list
67
70
  end
68
71
 
69
- def parse_csv_as_datasets(doc)
72
+ def parse_reports_as_datasets(reports)
70
73
  ds = {}
71
- CSV.parse(doc, headers: true).each do |r|
72
- asm = r['assembly']
74
+ reports.each do |r|
75
+ asm = r[:accession]
73
76
  next if asm.nil? || asm.empty? || asm == '-'
74
77
 
75
- rep = remote_row_replicons(r)
76
- n = remote_row_name(r, rep, asm)
77
-
78
78
  # Register for download
79
+ n = remote_report_name(r, asm)
79
80
  ds[n] = {
80
81
  ids: [asm], db: :assembly, universe: :ncbi,
81
82
  md: {
82
- type: :genome, ncbi_asm: asm, strain: r['strain']
83
+ type: :genome, ncbi_asm: asm, strain: r.dig(:organism, :infraspecific_names, :strain)
83
84
  }
84
85
  }
85
- ds[n][:md][:ncbi_nuccore] = rep.join(',') unless rep.nil?
86
- unless r['release_date'].nil?
87
- ds[n][:md][:release_date] = Time.parse(r['release_date']).to_s
88
- end
86
+ date = r.dig(:assembly_info, :release_date)
87
+ ds[n][:md][:release_date] = Time.parse(date).to_s if date
88
+ ds[n][:md][:ncbi_dataset] = r
89
89
  end
90
90
  ds
91
91
  end
92
92
 
93
- def remote_row_replicons(r)
94
- return if r['replicons'].nil?
95
-
96
- r['replicons']
97
- .split('; ')
98
- .map { |i| i.gsub(/.*:/, '') }
99
- .map { |i| i.gsub(%r{/.*}, '') }
100
- end
101
-
102
- def remote_row_name(r, rep, asm)
103
- return r['#organism'].miga_name if cli[:legacy_name] && cli[:reference]
104
-
105
- if cli[:legacy_name] && ['Complete', ' Chromosome'].include?(r['level'])
106
- acc = rep.nil? ? '' : rep.first
107
- else
108
- acc = asm
109
- end
93
+ def remote_report_name(r, asm)
94
+ acc = "#{asm}"
110
95
  acc.gsub!(/\.\d+\Z/, '') unless cli[:add_version]
111
- "#{r['#organism']}_#{acc}".miga_name
96
+ org = r.dig(:organism, :organism_name)
97
+ acc = "#{org}_#{acc}" if org
98
+ acc.miga_name
112
99
  end
113
100
 
114
- def remote_list_url
115
- url_base = 'https://www.ncbi.nlm.nih.gov/genomes/solr2txt.cgi?'
116
- url_param = {
117
- q: '[display()].' \
118
- 'from(GenomeAssemblies).' \
119
- 'usingschema(/schema/GenomeAssemblies).' \
120
- 'matching(tab==["Prokaryotes"] and q=="' \
121
- "#{cli[:taxon]&.tr('"', "'")}\"",
122
- fields: 'organism|organism,assembly|assembly,replicons|replicons,' \
123
- 'level|level,release_date|release_date,strain|strain',
124
- nolimit: 'on'
125
- }
101
+ def remote_list_query
102
+ q = { taxons: [cli[:taxon]], filters: {} }
126
103
  if cli[:reference]
127
- url_param[:q] += ' and refseq_category==["representative"]'
104
+ q[:filters][:reference_only] = true
128
105
  else
129
- status = {
130
- complete: 'Complete',
131
- chromosome: ' Chromosome', # <- The leading space is *VERY* important!
132
- scaffold: 'Scaffold',
133
- contig: 'Contig'
134
- }.map { |k, v| '"' + v + '"' if cli[k] }.compact.join(',')
135
- url_param[:q] += ' and level==[' + status + ']'
106
+ q[:assembly_level] = {
107
+ contig: 'contig',
108
+ scaffold: 'scaffold',
109
+ chromosome: 'chromosome',
110
+ complete: 'complete_genome'
111
+ }.map { |k, v| '"' + v + '"' if cli[k] }.compact
136
112
  end
137
- url_param[:q] += ')'
138
- url_base + URI.encode_www_form(url_param)
113
+ q
139
114
  end
140
115
  end
@@ -29,8 +29,7 @@ module MiGA::Cli::Action::Download::Seqcode
29
29
 
30
30
  while current_page <= total_pages
31
31
  json = MiGA::RemoteDataset.download(
32
- :seqcode, :'type-genomes', nil, :json, nil,
33
- ["page=#{current_page}"]
32
+ :seqcode, :'type-genomes', nil, :json, nil, page: current_page
34
33
  )
35
34
  doc = MiGA::Json.parse(json, contents: true)
36
35
  current_page = doc[:current_page] + 1
@@ -8,8 +8,7 @@ class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action
8
8
 
9
9
  def parse_cli
10
10
  cli.defaults = {
11
- query: false, unlink: false,
12
- reference: false, legacy_name: false,
11
+ query: false, unlink: false, reference: false,
13
12
  complete: false, chromosome: false,
14
13
  scaffold: false, contig: false, add_version: true, dry: false,
15
14
  get_md: false, only_md: false, save_every: 1
@@ -29,12 +28,6 @@ class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action
29
28
  '--api-key STRING',
30
29
  '::HIDE::' # For backwards compatibility
31
30
  ) { |v| ENV['NCBI_API_KEY'] = v }
32
- opt.on(
33
- '--ncbi-table-file STRING',
34
- '::HIDE::' # Only meant for debugging
35
- # It can take the table returned by NCBI and parse it from a file
36
- # instead of downloading it directly
37
- ) { |v| cli[:ncbi_table_file] = v }
38
31
  opt.on(
39
32
  '--ncbi-api-key STRING',
40
33
  'NCBI API key'
@@ -8,7 +8,8 @@ module MiGA::Cli::Action::Wf
8
8
  cli.expect_files = true
9
9
  cli.defaults = {
10
10
  clean: false, project_type: :genomes, dataset_type: :popgenome,
11
- ncbi_draft: true, min_qual: MiGA::Project.OPTIONS[:min_qual][:default],
11
+ ncbi_draft: true, ncbi_ref: false,
12
+ min_qual: MiGA::Project.OPTIONS[:min_qual][:default],
12
13
  prepare_and_exit: false
13
14
  }
14
15
  end
@@ -39,14 +40,21 @@ module MiGA::Cli::Action::Wf
39
40
  '-T', '--ncbi-taxon STRING',
40
41
  'Download all the genomes in NCBI classified as this taxon'
41
42
  ) { |v| cli[:ncbi_taxon] = v }
43
+ opt.on(
44
+ '--no-draft', '::HIDE::' # Deprecated
45
+ ) { |v| cli[:ncbi_draft] = v }
46
+ opt.on(
47
+ '--ncbi-complete',
48
+ 'Only download complete genomes, not drafts (requires -T)'
49
+ ) { |v| cli[:ncbi_draft] = !v }
50
+ opt.on(
51
+ '--ncbi-ref',
52
+ 'Only download RefSeq reference genomes (requires -T)'
53
+ ) { |v| cli[:ncbi_ref] = v }
42
54
  opt.on(
43
55
  '-G', '--gtdb-taxon STRING',
44
56
  'Download all the genomes in GTDB classified as this taxon'
45
57
  ) { |v| cli[:gtdb_taxon] = v }
46
- opt.on(
47
- '--no-draft',
48
- 'Only download complete genomes, not drafts (requires -T)'
49
- ) { |v| cli[:ncbi_draft] = v }
50
58
  opt.on(
51
59
  '--gtdb-ref',
52
60
  'Only download reference anchor genomes in GTDB (requires -G)'
@@ -170,7 +178,8 @@ module MiGA::Cli::Action::Wf
170
178
  def download_datasets
171
179
  # Download datasets from NCBI
172
180
  unless cli[:ncbi_taxon].nil?
173
- what = cli[:ncbi_draft] ? '--all' : '--complete'
181
+ what = cli[:ncbi_ref] ? '--reference' :
182
+ cli[:ncbi_draft] ? '--all' : '--complete'
174
183
  cmd = ['ncbi_get', '-P', cli[:outdir], '-T', cli[:ncbi_taxon], what]
175
184
  cmd += ['--max', cli[:max_download]] if cli[:max_download]
176
185
  call_cli(cmd)
@@ -60,6 +60,9 @@ module MiGA::Cli::ObjectsHelper
60
60
  o &&= (d.ref? == self[:ref]) unless self[:ref].nil?
61
61
  o &&= (d.active? == self[:active]) unless self[:active].nil?
62
62
  o &&= (self[:multi] ? d.multi? : d.nonmulti?) unless self[:multi].nil?
63
+ unless self[:markers].nil?
64
+ o &&= (self[:markers] ? d.markers? : !d.markers?)
65
+ end
63
66
  unless self[:taxonomy].nil?
64
67
  o &&= !d.metadata[:tax].nil? && d.metadata[:tax].in?(self[:taxonomy])
65
68
  end
@@ -43,7 +43,7 @@ module MiGA::Cli::OptHelper
43
43
  '-h', '--help',
44
44
  'Display this screen'
45
45
  ) do
46
- puts opt.to_s.gsub(/^.*\s+::HIDE::\s*$/, '')
46
+ puts opt.to_a.select { |i| i !~ /\s::HIDE::\s/ }
47
47
  exit
48
48
  end
49
49
  opt.separator ''
@@ -120,10 +120,11 @@ module MiGA::Cli::OptHelper
120
120
  # as determined by +what+ an Array with any combination of:
121
121
  # - :ref To filter by reference (--ref) or query (--no-ref)
122
122
  # - :multi To filter by multiple (--multi) or single (--no-multi) species
123
+ # - :markers To filter by datasets with (--markers) or without (--no-markers) markers
123
124
  # - :active To filter by active (--active) or inactive (--no-active)
124
125
  # - :taxonomy To filter by taxonomy (--taxonomy)
125
126
  # The "k-th" filter (--dataset-k) is always included
126
- def opt_filter_datasets(opt, what = %i[ref multi active taxonomy])
127
+ def opt_filter_datasets(opt, what = %i[ref multi markers active taxonomy])
127
128
  what.each do |w|
128
129
  case w
129
130
  when :ref
@@ -136,6 +137,11 @@ module MiGA::Cli::OptHelper
136
137
  '--[no-]multi',
137
138
  'Use only multi-species (or only single-species) datasets'
138
139
  ) { |v| self[:multi] = v }
140
+ when :markers
141
+ opt.on(
142
+ '--[no-]markers',
143
+ 'Use only datasets with (or without) markers'
144
+ ) { |v| self[:markers] = v }
139
145
  when :active
140
146
  opt.on(
141
147
  '--[no-]active',
@@ -1,5 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'net/http'
3
4
  require 'net/ftp'
4
5
  require 'open-uri'
5
6
  require 'fileutils'
@@ -10,6 +11,8 @@ Net::FTP.const_set('FTP_PORT', 21)
10
11
  ##
11
12
  # General web-access functions shared throughout MiGA.
12
13
  module MiGA::Common::Net
14
+ attr_accessor :remote_connection_uri
15
+
13
16
  ##
14
17
  # Returns the URL of the host +name+ (Symbol)
15
18
  def known_hosts(name)
@@ -21,7 +24,7 @@ module MiGA::Common::Net
21
24
  when :miga_dist
22
25
  "ftp://#{main_server}/dist"
23
26
  else
24
- raise "Unrecognized server name: #{host}"
27
+ raise "Unrecognized server name: #{name}"
25
28
  end
26
29
  end
27
30
 
@@ -32,49 +35,128 @@ module MiGA::Common::Net
32
35
  end
33
36
 
34
37
  ##
35
- # Connect to an FTP +host+ (String) or a known host name (Symbol, see
36
- # +.known_hosts+)
38
+ # Connect to an FTP +host+ (String), a known host name (Symbol, see
39
+ # +.known_hosts+), or a parsed +URI+ object
40
+ #
41
+ # Sets the attribute +remote_connection_uri+ to the parsed +URI+ object
42
+ # silently
37
43
  def remote_connection(host)
38
44
  host = known_hosts(host) if host.is_a?(Symbol)
39
- uri = URI.parse(host)
40
- raise 'Only FTP hosts are currently supported' unless uri.scheme == 'ftp'
41
-
42
- ftp = Net::FTP.new(uri.host)
43
- ftp.passive = true
44
- ftp.login
45
- ftp.chdir(uri.path)
46
- ftp
45
+ uri = host.is_a?(URI) ? host : URI.parse(host)
46
+ @remote_connection_uri = uri
47
+
48
+ case uri.scheme
49
+ when 'ftp'
50
+ ftp = Net::FTP.new(uri.host)
51
+ ftp.passive = true
52
+ ftp.login
53
+ ftp.chdir(uri.path) unless host.is_a?(URI)
54
+ ftp
55
+ when 'http', 'https'
56
+ http = Net::HTTP.new(uri.host, uri.port)
57
+ http.read_timeout = 600
58
+ http.use_ssl = uri.scheme == 'https'
59
+ http
60
+ else
61
+ raise 'Only FTP, HTTP, and HTTPS are currently supported'
62
+ end
47
63
  end
48
64
 
49
65
  ##
50
66
  # Download a file via FTP using the +connection+ (returned by
51
- # +.remote_connection+) with remote name +file+ into local +target+.
67
+ # +.remote_connection+) with remote name +file+ into local +target+. If +file+
68
+ # is +nil+, it tries to guess the file from +connection+. If +target+ is
69
+ # +nil+, it returns the read data instead
52
70
  #
53
- # Alternatively, +connection+ can simply be the host (String) or a recognized
54
- # Symbol (see +.remote_connection+), in which case the function opens the
55
- # connection automatically
71
+ # Alternatively, +connection+ can simply be the host (String), a recognized
72
+ # Symbol (see +.remote_connection+), or a parsed +URI+ object, in which case
73
+ # the function opens the connection automatically
56
74
  #
57
75
  # Reports progress to the function block with two arguments: the
58
76
  # currently transferred size and the total file size
59
- def download_file_ftp(connection, file, target)
77
+ def download_file_ftp(connection, file = nil, target = nil)
60
78
  # Open connection unless passed
61
79
  close_conn = false
62
- if connection.is_a?(String) || connection.is_a?(Symbol)
80
+ if connection.is_a?(String) || connection.is_a?(Symbol) ||
81
+ connection.is_a?(URI)
63
82
  connection = remote_connection(connection)
83
+ file ||= remote_connection_uri.path
64
84
  close_conn = true
65
85
  end
66
86
 
67
87
  # Prepare download
68
- FileUtils.mkdir_p(File.dirname(target))
88
+ FileUtils.mkdir_p(File.dirname(target)) if target
69
89
  filesize = connection.size(file)
70
90
  transferred = 0
71
91
 
72
92
  # Get in chunks of 1KiB
93
+ ret = ''
73
94
  connection.getbinaryfile(file, target, 1024) do |data|
74
95
  yield(transferred += data.size, filesize) if block_given?
96
+ ret += data unless target
75
97
  end
76
98
 
77
99
  # Close connection if automatically opened
78
100
  connection.close if close_conn
101
+ ret unless target
102
+ end
103
+
104
+ ##
105
+ # Submit an HTTP or HTTPS request using +url+, which should be a URL
106
+ # either as String or parsed URI. The request follows the +method+, which
107
+ # should be a Net::HTTP verb such as +:get+, +:post+, or +:patch+. All
108
+ # additional parameters for the corresponding method should be passed as
109
+ # +opts+.
110
+ def http_request(method, url, *opts)
111
+ doc = nil
112
+ remote_connection(url).start do |http|
113
+ res = http.send(method, remote_connection_uri.to_s, *opts)
114
+ if %w[301 302].include?(res.code)
115
+ DEBUG "REDIRECTION #{res.code}: #{res['location']}"
116
+ return http_request(method, res['location'], *opts)
117
+ end
118
+ res.value # To force exception unless success
119
+ doc = res.body
120
+ end
121
+ doc
122
+ end
123
+
124
+ def net_method(method, uri, *opts)
125
+ attempts ||= 0
126
+ DEBUG "#{method.to_s.upcase}: #{uri} #{opts}"
127
+ case method.to_sym
128
+ when :ftp
129
+ download_file_ftp(uri)
130
+ else
131
+ http_request(method, uri, *opts)
132
+ end
133
+ rescue => e
134
+ raise e if (attempts += 1) >= 3
135
+
136
+ sleep 5 # <- For: 429 Too Many Requests
137
+ DEBUG "RETRYING after: #{e}"
138
+ retry
139
+ end
140
+
141
+ alias :https_request :http_request
142
+
143
+ ##
144
+ # Normalize the encoding of +body+ to UTF-8 by attempting several
145
+ # common recodings. Code from https://github.com/seq-code/registry
146
+ def normalize_encoding(body)
147
+ # Test encodings
148
+ body.force_encoding('utf-8')
149
+ %w[iso8859-1 windows-1252 us-ascii ascii-8bit].each do |enc|
150
+ break if body.valid_encoding?
151
+ recode = body.force_encoding(enc).encode('utf-8')
152
+ body = recode if recode.valid_encoding?
153
+ end
154
+ # If nothing works, replace offending characters with '?'
155
+ unless body.valid_encoding?
156
+ body = body.encode(
157
+ 'utf-8', invalid: :replace, undef: :replace, replace: '?'
158
+ )
159
+ end
160
+ body
79
161
  end
80
162
  end
@@ -32,6 +32,12 @@ class MiGA::Dataset < MiGA::MiGA
32
32
  @@EXCLUDE_NOREF_TASKS
33
33
  end
34
34
 
35
+ ##
36
+ # Tasks to be excluded from datasets without markers
37
+ def EXCLUDE_NOMARKER_TASKS
38
+ @@EXCLUDE_NOMARKER_TASKS
39
+ end
40
+
35
41
  ##
36
42
  # Tasks to be executed only in datasets that are single-organism. These
37
43
  # tasks are ignored for multi-organism datasets or for unknown types
@@ -81,45 +87,67 @@ module MiGA::Dataset::Base
81
87
  # Supported dataset types
82
88
  @@KNOWN_TYPES = {
83
89
  genome: {
84
- description: 'The genome from an isolate', multi: false
90
+ description: 'The genome from an isolate',
91
+ multi: false, markers: true,
92
+ project_types: %i[mixed genomes clade]
85
93
  },
86
94
  scgenome: {
87
- description: 'A Single-cell Amplified Genome (SAG)', multi: false
95
+ description: 'A Single-cell Amplified Genome (SAG)',
96
+ multi: false, markers: true,
97
+ project_types: %i[mixed genomes clade]
88
98
  },
89
99
  popgenome: {
90
- description: 'A Metagenome-Assembled Genome (MAG)', multi: false
100
+ description: 'A Metagenome-Assembled Genome (MAG)',
101
+ multi: false, markers: true,
102
+ project_types: %i[mixed genomes clade]
91
103
  },
92
104
  metagenome: {
93
- description: 'A metagenome (excluding viromes)', multi: true
105
+ description: 'A metagenome (excluding viromes)',
106
+ multi: true, markers: true,
107
+ project_types: %i[mixed metagenomes]
94
108
  },
95
109
  virome: {
96
- description: 'A viral metagenome', multi: true
110
+ description: 'A viral metagenome',
111
+ multi: true,
112
+ markers: true, # <- We don't expect, but can be useful for contamination
113
+ project_types: %i[mixed metagenomes]
114
+ },
115
+ plasmid: {
116
+ description: 'An individual plasmid',
117
+ multi: false, markers: false,
118
+ project_types: %i[mixed plasmids]
97
119
  }
98
120
  }
99
121
 
100
122
  ##
101
123
  # Returns an Array of tasks (Symbols) to be executed before project-wide tasks
102
- @@PREPROCESSING_TASKS = [
103
- :raw_reads, :trimmed_reads, :read_quality, :trimmed_fasta,
104
- :assembly, :cds, :essential_genes, :mytaxa, :mytaxa_scan,
105
- :taxonomy, :distances, :ssu, :stats
124
+ @@PREPROCESSING_TASKS = %i[
125
+ raw_reads trimmed_reads read_quality trimmed_fasta
126
+ assembly cds essential_genes mytaxa mytaxa_scan
127
+ taxonomy distances ssu stats
106
128
  ]
107
129
 
108
130
  ##
109
131
  # Tasks to be excluded from query datasets
110
- @@EXCLUDE_NOREF_TASKS = [:mytaxa_scan, :taxonomy]
132
+ @@EXCLUDE_NOREF_TASKS = %i[mytaxa_scan taxonomy]
111
133
  @@_EXCLUDE_NOREF_TASKS_H = Hash[@@EXCLUDE_NOREF_TASKS.map { |i| [i, true] }]
112
134
 
135
+ ##
136
+ # Tasks to be excluded from datasets without markers
137
+ @@EXCLUDE_NOMARKER_TASKS = %i[essential_genes ssu]
138
+ @@_EXCLUDE_NOMARKER_TASKS_H =
139
+ Hash[@@EXCLUDE_NOMARKER_TASKS.map { |i| [i, true] }]
140
+
113
141
  ##
114
142
  # Tasks to be executed only in datasets that are single-organism. These
115
143
  # tasks are ignored for multi-organism datasets or for unknown types
116
- @@ONLY_NONMULTI_TASKS = [:mytaxa_scan, :taxonomy, :distances]
144
+ @@ONLY_NONMULTI_TASKS = %i[mytaxa_scan taxonomy distances]
117
145
  @@_ONLY_NONMULTI_TASKS_H = Hash[@@ONLY_NONMULTI_TASKS.map { |i| [i, true] }]
118
146
 
119
147
  ##
120
148
  # Tasks to be executed only in datasets that are multi-organism. These
121
149
  # tasks are ignored for single-organism datasets or for unknown types
122
- @@ONLY_MULTI_TASKS = [:mytaxa]
150
+ @@ONLY_MULTI_TASKS = %i[mytaxa]
123
151
  @@_ONLY_MULTI_TASKS_H = Hash[@@ONLY_MULTI_TASKS.map { |i| [i, true] }]
124
152
 
125
153
  ##
@@ -15,6 +15,7 @@ require 'miga/common/hooks'
15
15
  # Supported hooks:
16
16
  # - run_lambda(lambda, args...)
17
17
  # - recalculate_status()
18
+ # - check_type()
18
19
  # - clear_run_counts()
19
20
  # - run_cmd(cmd)
20
21
  # Internal hooks:
@@ -27,6 +28,7 @@ module MiGA::Dataset::Hooks
27
28
  def default_hooks
28
29
  {
29
30
  on_create: [[:recalculate_status]],
31
+ on_save: [[:check_type]],
30
32
  on_activate: [[:clear_run_counts], [:recalculate_status]],
31
33
  on_inactivate: [[:recalculate_status]],
32
34
  on_result_ready: [[:_pull_result_hooks]],
@@ -51,6 +53,12 @@ module MiGA::Dataset::Hooks
51
53
  recalculate_status
52
54
  end
53
55
 
56
+ ##
57
+ # Ensure that the dataset type exists and is compatible with the project type
58
+ def hook_check_type(_hook_args, _event_args)
59
+ check_type
60
+ end
61
+
54
62
  ##
55
63
  # Run +cmd+ in the command-line with {{variables}}:
56
64
  # dataset, project, project_name, miga, object (if defined for the event)