miga-base 1.3.8.2 → 1.3.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (50) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +2 -2
  3. data/lib/miga/cli/action/add_result.rb +22 -1
  4. data/lib/miga/cli/action/browse/about.html +4 -2
  5. data/lib/miga/cli/action/download/gtdb.rb +1 -1
  6. data/lib/miga/cli/action/download/ncbi.rb +43 -68
  7. data/lib/miga/cli/action/download/seqcode.rb +1 -2
  8. data/lib/miga/cli/action/ncbi_get.rb +1 -8
  9. data/lib/miga/cli/action/wf.rb +15 -6
  10. data/lib/miga/cli/objects_helper.rb +3 -0
  11. data/lib/miga/cli/opt_helper.rb +8 -2
  12. data/lib/miga/common/net.rb +100 -18
  13. data/lib/miga/dataset/base.rb +40 -12
  14. data/lib/miga/dataset/hooks.rb +8 -0
  15. data/lib/miga/dataset/result/ignore.rb +14 -2
  16. data/lib/miga/dataset/type.rb +51 -0
  17. data/lib/miga/dataset.rb +3 -22
  18. data/lib/miga/json.rb +9 -0
  19. data/lib/miga/project/base.rb +15 -9
  20. data/lib/miga/project.rb +7 -1
  21. data/lib/miga/remote_dataset/base.rb +117 -36
  22. data/lib/miga/remote_dataset/download.rb +121 -54
  23. data/lib/miga/remote_dataset.rb +34 -13
  24. data/lib/miga/result/stats.rb +2 -0
  25. data/lib/miga/result/versions.rb +23 -0
  26. data/lib/miga/result.rb +7 -1
  27. data/lib/miga/taxonomy/base.rb +3 -2
  28. data/lib/miga/version.rb +2 -2
  29. data/scripts/assembly.bash +15 -1
  30. data/scripts/cds.bash +9 -3
  31. data/scripts/distances.bash +103 -5
  32. data/scripts/essential_genes.bash +14 -1
  33. data/scripts/mytaxa.bash +18 -3
  34. data/scripts/mytaxa_scan.bash +16 -3
  35. data/scripts/read_quality.bash +6 -2
  36. data/scripts/ssu.bash +19 -1
  37. data/scripts/stats.bash +9 -3
  38. data/scripts/taxonomy.bash +98 -2
  39. data/scripts/trimmed_fasta.bash +10 -2
  40. data/scripts/trimmed_reads.bash +26 -6
  41. data/test/dataset_test.rb +17 -2
  42. data/test/hook_test.rb +3 -2
  43. data/test/net_test.rb +21 -5
  44. data/test/project_test.rb +13 -0
  45. data/test/remote_dataset_test.rb +106 -7
  46. data/test/result_test.rb +47 -21
  47. data/test/taxonomy_test.rb +9 -3
  48. data/utils/distance/runner.rb +3 -1
  49. data/utils/distances.rb +1 -1
  50. metadata +4 -2
@@ -17,10 +17,14 @@ module MiGA::Dataset::Result::Ignore
17
17
  # - project: incompatible project
18
18
  # - noref: incompatible dataset, only for reference
19
19
  # - multi: incompatible dataset, only for multi
20
+ # - nomarkers: incompatible dataset, only for markers
20
21
  # - nonmulti: incompatible dataset, only for nonmulti
21
22
  # - complete: the task is already complete
22
23
  def ignore_reasons
23
- %i[empty inactive upstream force project noref multi nonmulti complete]
24
+ %i[
25
+ empty inactive upstream force project
26
+ noref multi nonmulti nomarkers complete
27
+ ]
24
28
  end
25
29
 
26
30
  ##
@@ -91,9 +95,15 @@ module MiGA::Dataset::Result::Ignore
91
95
  ignore_by_type?(task, :nonmulti)
92
96
  end
93
97
 
98
+ ##
99
+ # Ignore +task+ because it's not a markers dataset
100
+ def ignore_nomarkers?(task)
101
+ ignore_by_type?(task, :nomarkers)
102
+ end
103
+
94
104
  ##
95
105
  # Ignore +task+ by +type+ of dataset, one of: +:noref+, +:multi+, or
96
- # +:nonmulti+
106
+ # +:nonmulti+, +:nomarkers+
97
107
  def ignore_by_type?(task, type)
98
108
  return false if force_task?(task)
99
109
 
@@ -105,6 +115,8 @@ module MiGA::Dataset::Result::Ignore
105
115
  [:multi?, self.class.ONLY_MULTI_TASKS]
106
116
  when :nonmulti
107
117
  [:nonmulti?, self.class.ONLY_NONMULTI_TASKS]
118
+ when :nomarkers
119
+ [:markers?, self.class.EXCLUDE_NOMARKER_TASKS]
108
120
  else
109
121
  raise "Unexpected error, unknown type reason: #{type}"
110
122
  end
@@ -0,0 +1,51 @@
1
+ ##
2
+ # Helper module including specific functions for dataset type
3
+ module MiGA::Dataset::Type
4
+ ##
5
+ # Get the type of dataset as Symbol
6
+ def type
7
+ metadata[:type]
8
+ end
9
+
10
+ ##
11
+ # Is this dataset known to be multi-organism?
12
+ def multi?
13
+ self.class.KNOWN_TYPES.dig(type, :multi)
14
+ end
15
+
16
+ ##
17
+ # Is this dataset known to be single-organism?
18
+ def nonmulti?
19
+ y = self.class.KNOWN_TYPES.dig(type, :multi)
20
+ y.nil? ? nil : !y
21
+ end
22
+
23
+ ##
24
+ # Are universal marker genes expected to be found in this dataset?
25
+ def markers?
26
+ self.class.KNOWN_TYPES.dig(type, :markers)
27
+ end
28
+
29
+ ##
30
+ # Check that the dataset type is defined, known, and compatible with the
31
+ # project type and raise an exception if any of these checks fail
32
+ #
33
+ # If the dataset type is +:empty+, it returns +false+ without raising an
34
+ # exception, and +true+ otherwise (if none of the checks fail)
35
+ def check_type
36
+ raise MiGA::Error.new('Undefined dataset type') unless type
37
+ return false if type == :empty
38
+
39
+ unless self.class.KNOWN_TYPES[type]
40
+ raise MiGA::Error.new("Unknown dataset type: #{type}")
41
+ end
42
+ unless self.class.KNOWN_TYPES[type][:project_types].include? project.type
43
+ raise MiGA::Error.new(
44
+ "Dataset type (#{type}) incompatible with project (#{project.type})"
45
+ )
46
+ end
47
+
48
+ true
49
+ end
50
+
51
+ end
data/lib/miga/dataset.rb CHANGED
@@ -6,6 +6,7 @@
6
6
  require 'miga/metadata'
7
7
  require 'miga/dataset/result'
8
8
  require 'miga/dataset/status'
9
+ require 'miga/dataset/type'
9
10
  require 'miga/dataset/hooks'
10
11
 
11
12
  # This library is only required by +#closest_relatives+, so it is now
@@ -18,6 +19,7 @@ require 'miga/dataset/hooks'
18
19
  class MiGA::Dataset < MiGA::MiGA
19
20
  include MiGA::Dataset::Result
20
21
  include MiGA::Dataset::Status
22
+ include MiGA::Dataset::Type
21
23
  include MiGA::Dataset::Hooks
22
24
 
23
25
  # Class-level
@@ -56,6 +58,7 @@ class MiGA::Dataset < MiGA::MiGA
56
58
  name.to_s
57
59
  @project, @name, @metadata = project, name, nil
58
60
  metadata[:ref] = is_ref
61
+ metadata[:type] ||= :empty
59
62
  @metadata_future = [
60
63
  File.join(project.path, 'metadata', "#{name}.json"),
61
64
  metadata
@@ -89,12 +92,6 @@ class MiGA::Dataset < MiGA::MiGA
89
92
  # +Project+ interface
90
93
  alias :save! :save
91
94
 
92
- ##
93
- # Get the type of dataset as Symbol
94
- def type
95
- metadata[:type]
96
- end
97
-
98
95
  ##
99
96
  # Delete the dataset with all its contents (including results) and returns
100
97
  # nil
@@ -146,22 +143,6 @@ class MiGA::Dataset < MiGA::MiGA
146
143
  !metadata[:ref]
147
144
  end
148
145
 
149
- ##
150
- # Is this dataset known to be multi-organism?
151
- def multi?
152
- return false if metadata[:type].nil? || @@KNOWN_TYPES[type].nil?
153
-
154
- @@KNOWN_TYPES[type][:multi]
155
- end
156
-
157
- ##
158
- # Is this dataset known to be single-organism?
159
- def nonmulti?
160
- return false if metadata[:type].nil? || @@KNOWN_TYPES[type].nil?
161
-
162
- !@@KNOWN_TYPES[type][:multi]
163
- end
164
-
165
146
  ##
166
147
  # Is this dataset active?
167
148
  def active?
data/lib/miga/json.rb CHANGED
@@ -69,5 +69,14 @@ class MiGA::Json < MiGA::MiGA
69
69
  File.open(path, 'w') { |fh| fh.print y } unless path.nil?
70
70
  y
71
71
  end
72
+
73
+ ##
74
+ # Generates and returns plain JSON to represent +obj+.
75
+ # If +path+ is passed, it saves the JSON in that file.
76
+ def generate_plain(obj, path = nil)
77
+ y = JSON.generate(obj)
78
+ File.open(path, 'w') { |fh| fh.print y } unless path.nil?
79
+ y
80
+ end
72
81
  end
73
82
  end
@@ -89,32 +89,36 @@ module MiGA::Project::Base
89
89
  @@KNOWN_TYPES = {
90
90
  mixed: {
91
91
  description: 'Mixed collection of genomes, metagenomes, and viromes',
92
- single: true, multi: true
92
+ single: true, multi: true, markers: true
93
93
  },
94
94
  genomes: {
95
95
  description: 'Collection of genomes',
96
- single: true, multi: false
96
+ single: true, multi: false, markers: true
97
97
  },
98
98
  clade: {
99
99
  description: 'Collection of closely-related genomes (ANI >= 90%)',
100
- single: true, multi: false
100
+ single: true, multi: false, markers: true
101
101
  },
102
102
  metagenomes: {
103
103
  description: 'Collection of metagenomes and/or viromes',
104
- single: false, multi: true
104
+ single: false, multi: true, markers: true
105
+ },
106
+ plasmids: {
107
+ description: 'Collection of plasmids',
108
+ single: true, multi: false, markers: false
105
109
  }
106
110
  }
107
111
 
108
112
  ##
109
113
  # Project-wide distance estimations
110
- @@DISTANCE_TASKS = [
111
- :project_stats, :haai_distances, :aai_distances, :ani_distances,
112
- :clade_finding
114
+ @@DISTANCE_TASKS = %i[
115
+ project_stats haai_distances aai_distances ani_distances
116
+ clade_finding
113
117
  ]
114
118
 
115
119
  ##
116
120
  # Project-wide tasks for :clade projects
117
- @@INCLADE_TASKS = [:subclades, :ogs]
121
+ @@INCLADE_TASKS = %i[subclades ogs]
118
122
 
119
123
  ##
120
124
  # Options supported by projects
@@ -131,7 +135,9 @@ module MiGA::Project::Base
131
135
  },
132
136
  haai_p: {
133
137
  desc: 'Value of aai.rb -p on hAAI', type: String,
134
- default: proc { |project| project.clade? ? 'no' : 'fastaai' },
138
+ default: proc { |project|
139
+ project.clade? || !project.markers? ? 'no' : 'fastaai'
140
+ },
135
141
  in: %w[blast+ blast blat diamond fastaai no]
136
142
  },
137
143
  aai_p: {
data/lib/miga/project.rb CHANGED
@@ -98,7 +98,7 @@ class MiGA::Project < MiGA::MiGA
98
98
  ##
99
99
  # Is this a clade project?
100
100
  def clade?
101
- type == :clade
101
+ %i[clade plasmids].include? type
102
102
  end
103
103
 
104
104
  ##
@@ -115,6 +115,12 @@ class MiGA::Project < MiGA::MiGA
115
115
  # Same as multi? For backward compatibility
116
116
  alias is_multi? multi?
117
117
 
118
+ ##
119
+ # Does the project support the use of universal markers?
120
+ def markers?
121
+ @@KNOWN_TYPES[type][:markers]
122
+ end
123
+
118
124
  ##
119
125
  # Is this project active? Currently a dummy function, returns
120
126
  # always true.
@@ -1,4 +1,3 @@
1
- require 'open-uri'
2
1
  require 'cgi'
3
2
 
4
3
  class MiGA::RemoteDataset < MiGA::MiGA
@@ -10,13 +9,24 @@ class MiGA::RemoteDataset < MiGA::MiGA
10
9
  end
11
10
  end
12
11
 
12
+ def uri_safe_join(*parts)
13
+ safe = parts.map { |i| i.is_a?(Array) ? i.join(',') : i.to_s }
14
+ last = safe.pop
15
+ safe.map! { |i| i[-1] == '/' ? i : "#{i}/" }
16
+ safe << last
17
+ URI::join(*safe)
18
+ end
19
+
13
20
  module MiGA::RemoteDataset::Base
14
- @@_EUTILS = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
15
- @@_EBI_API = 'https://www.ebi.ac.uk/Tools'
16
- @@_GTDB_API = 'https://api.gtdb.ecogenomic.org'
17
- @@_SEQCODE_API = 'https://disc-genomics.uibk.ac.at/seqcode'
18
- @@_NCBI_API_KEY = lambda { |url|
19
- ENV['NCBI_API_KEY'].nil? ? url : "#{url}&api_key=#{ENV['NCBI_API_KEY']}"
21
+ @@_NCBI_DATASETS = 'https://api.ncbi.nlm.nih.gov/datasets/v2alpha/'
22
+ @@_EUTILS = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
23
+ @@_EBI_API = 'https://www.ebi.ac.uk/Tools/'
24
+ @@_GTDB_API = 'https://api.gtdb.ecogenomic.org/'
25
+ @@_SEQCODE_API = 'https://disc-genomics.uibk.ac.at/seqcode/'
26
+ @@_EUTILS_BUILD = lambda { |service, q|
27
+ q[:api_key] = ENV['NCBI_API_KEY'] if ENV['NCBI_API_KEY']
28
+ uri_safe_join(@@_EUTILS, "#{service}.fcgi")
29
+ .tap { |uri| uri.query = URI.encode_www_form(q) }
20
30
  }
21
31
 
22
32
  ##
@@ -25,15 +35,13 @@ module MiGA::RemoteDataset::Base
25
35
  # supported keys as Symbol:
26
36
  # - +:dbs+ => Hash with keys being the database name and the values a Hash of
27
37
  # properties such as +stage+, +format+, +map_to+, and +getter+.
28
- # - +url+ => Pattern of the URL where the data can be obtained, where +%1$s+
29
- # is the name of the database, +%2$s+ is the IDs, and +%3$s+ is format.
30
- # Additional parameters can be passed to certain functions using the +extra+
31
- # option.
38
+ # - +uri+ => Function producing a parsed URI object, accepting one parameter:
39
+ # a Hash of options.
32
40
  # - +method+ => Method used to query the URL. Only +:rest+ and +:net+ are
33
41
  # currently supported.
34
- # - +api_key+ => A lambda function that takes a URL as input and returns the
35
- # URL to be downloaded with an API Key (if available).
36
42
  # - +map_to_universe+ => Universe where results map to. Currently unsupported.
43
+ # - +scheme+ => Function returning the scheme used as a String (ftp, http,
44
+ # https). Mandatory if method is :net.
37
45
  @@UNIVERSE = {
38
46
  web: {
39
47
  dbs: {
@@ -41,13 +49,18 @@ module MiGA::RemoteDataset::Base
41
49
  assembly_gz: { stage: :assembly, format: :fasta_gz },
42
50
  text: { stage: :metadata, format: :text }
43
51
  },
44
- url: '%2$s',
52
+ uri: lambda { |opts| URI.parse(opts[:ids][0]) },
53
+ scheme: lambda { |opts| opts[:ids][0].split(':', 2)[0] },
45
54
  method: :net
46
55
  },
47
56
  ebi: {
48
57
  dbs: { embl: { stage: :assembly, format: :fasta } },
49
- url: "#{@@_EBI_API}/dbfetch/dbfetch/%1$s/%2$s/%3$s",
50
- method: :rest
58
+ uri: lambda do |opts|
59
+ uri_safe_join(
60
+ @@_EBI_API, 'dbfetch', 'dbfetch', opts[:db], opts[:ids], opts[:format]
61
+ )
62
+ end,
63
+ method: :get
51
64
  },
52
65
  gtdb: {
53
66
  dbs: {
@@ -56,15 +69,18 @@ module MiGA::RemoteDataset::Base
56
69
  # The 'taxon' namespace actually returns a list of genomes (+format+)
57
70
  taxon: {
58
71
  stage: :metadata, format: :genomes, map_to: [:assembly],
59
- extra: ['sp_reps_only=false']
72
+ extra: { sp_reps_only: false }
60
73
  },
61
74
  # The 'genome' namespace actually returns the taxonomy (+format+)
62
75
  genome: { stage: :metadata, format: 'taxon-history' }
63
76
  },
64
- url: "#{@@_GTDB_API}/%1$s/%2$s/%3$s?%4$s",
65
- method: :rest,
77
+ uri: lambda do |opts|
78
+ uri_safe_join(@@_GTDB_API, opts[:db], opts[:ids], opts[:format])
79
+ .tap { |uri| uri.query = URI.encode_www_form(opts[:extra]) }
80
+ end,
81
+ method: :get,
66
82
  map_to_universe: :ncbi,
67
- headers: 'accept: application/json' # < TODO not currently supported
83
+ headers: lambda { |_opts| { 'Accept' => 'application/json' } }
68
84
  },
69
85
  seqcode: {
70
86
  dbs: {
@@ -74,8 +90,11 @@ module MiGA::RemoteDataset::Base
74
90
  # This is the list of type genomes
75
91
  :'type-genomes' => { stage: :metadata, format: :json }
76
92
  },
77
- url: "#{@@_SEQCODE_API}/%1$s.json?%4$s",
78
- method: :rest,
93
+ uri: lambda do |opts|
94
+ uri_safe_join(@@_SEQCODE_API, "#{opts[:db]}.json")
95
+ .tap { |uri| uri.query = URI.encode_www_form(opts[:extra]) }
96
+ end,
97
+ method: :get,
79
98
  map_to_universe: :ncbi
80
99
  },
81
100
  ncbi: {
@@ -84,9 +103,12 @@ module MiGA::RemoteDataset::Base
84
103
  assembly: { stage: :assembly, format: :fasta_gz, getter: :ncbi_asm },
85
104
  taxonomy: { stage: :metadata, format: :xml }
86
105
  },
87
- url: "#{@@_EUTILS}efetch.fcgi?db=%1$s&id=%2$s&rettype=%3$s&retmode=text",
88
- method: :rest,
89
- api_key: @@_NCBI_API_KEY
106
+ uri: lambda do |opts|
107
+ @@_EUTILS_BUILD[:efetch,
108
+ db: opts[:db], id: opts[:ids], rettype: opts[:format], retmode: :text
109
+ ]
110
+ end,
111
+ method: :get
90
112
  },
91
113
  ncbi_map: {
92
114
  dbs: {
@@ -95,22 +117,81 @@ module MiGA::RemoteDataset::Base
95
117
  },
96
118
  biosample: { stage: :metadata, map_to: [:assembly], format: :json }
97
119
  },
98
- url: "#{@@_EUTILS}elink.fcgi?dbfrom=%1$s&id=%2$s&db=%4$s&retmode=%3$s",
99
- method: :net,
100
- map_to_universe: :ncbi,
101
- api_key: @@_NCBI_API_KEY
120
+ uri: lambda do |opts|
121
+ @@_EUTILS_BUILD[:elink, {
122
+ dbfrom: opts[:db], id: opts[:ids], retmode: opts[:format]
123
+ }.merge(opts[:extra])]
124
+ end,
125
+ method: :get,
126
+ map_to_universe: :ncbi
102
127
  },
103
128
  ncbi_summary: {
104
129
  dbs: { assembly: { stage: :metadata, format: :json } },
105
- url: "#{@@_EUTILS}esummary.fcgi?db=%1$s&id=%2$s&retmode=%3$s",
106
- method: :rest,
107
- api_key: @@_NCBI_API_KEY
130
+ uri: lambda do |opts|
131
+ @@_EUTILS_BUILD[:esummary,
132
+ db: opts[:db], id: opts[:ids], retmode: opts[:format]
133
+ ]
134
+ end,
135
+ method: :get
108
136
  },
109
137
  ncbi_search: {
110
- dbs: { assembly: { stage: :metadata, format: :json } },
111
- url: "#{@@_EUTILS}esearch.fcgi?db=%1$s&term=%2$s&retmode=%3$s",
112
- method: :rest,
113
- api_key: @@_NCBI_API_KEY
138
+ dbs: {
139
+ assembly: { stage: :metadata, format: :json },
140
+ taxonomy: { stage: :metadata, format: :json }
141
+ },
142
+ uri: lambda do |opts|
143
+ @@_EUTILS_BUILD[:esearch,
144
+ db: opts[:db], term: opts[:ids], retmode: opts[:format]
145
+ ]
146
+ end,
147
+ method: :get
148
+ },
149
+ ncbi_datasets_download: {
150
+ dbs: { genome: { stage: :assembly, format: :zip } },
151
+ uri: lambda do |opts|
152
+ q = { include_annotation_type: 'GENOME_FASTA' }
153
+ uri_safe_join(
154
+ @@_NCBI_DATASETS, opts[:db], :accession, opts[:ids], :download
155
+ ).tap { |uri| uri.query = URI.encode_www_form(q) }
156
+ end,
157
+ method: :get,
158
+ headers: lambda do |opts|
159
+ {}.tap do |h|
160
+ h['Accept'] = 'application/zip' if opts[:format] == :zip
161
+ h['api-key'] = ENV['NCBI_API_KEY'] if ENV['NCBI_API_KEY']
162
+ end
163
+ end
164
+ },
165
+ ncbi_datasets: {
166
+ dbs: {
167
+ genome: {
168
+ stage: :metadata, format: :json, extra: { action: 'dataset_report' }
169
+ }
170
+ },
171
+ uri: lambda do |opts|
172
+ uri_safe_join(@@_NCBI_DATASETS, opts[:db], opts[:extra][:action])
173
+ end,
174
+ payload: lambda do |opts|
175
+ query = opts[:ids][0]
176
+ q = {
177
+ filters: {
178
+ assembly_version: 'current',
179
+ exclude_paired_reports: true
180
+ }.merge(query[:filters] || {}),
181
+ page_size: query[:page_size] || 1_000,
182
+ returned_content: 'COMPLETE'
183
+ }
184
+ q[:page_token] = query[:page_token] if query[:page_token]
185
+ q[:taxons] = query[:taxons] if query[:taxons]
186
+ MiGA::Json.generate_plain(q)
187
+ end,
188
+ headers: lambda do |opts|
189
+ {}.tap do |h|
190
+ h['api-key'] = ENV['NCBI_API_KEY'] if ENV['NCBI_API_KEY']
191
+ h['Content-Type'] = 'application/json' if opts[:format] == :json
192
+ end
193
+ end,
194
+ method: :post
114
195
  }
115
196
  }
116
197
  end