miga-base 0.7.21.0 → 0.7.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +3 -0
  3. data/README.md +1 -1
  4. data/Rakefile +1 -0
  5. data/lib/miga/cli/action/add.rb +1 -2
  6. data/lib/miga/cli/action/classify_wf.rb +12 -11
  7. data/lib/miga/cli/action/derep_wf.rb +3 -9
  8. data/lib/miga/cli/action/edit.rb +0 -1
  9. data/lib/miga/cli/action/find.rb +1 -1
  10. data/lib/miga/cli/action/generic.rb +1 -1
  11. data/lib/miga/cli/action/get.rb +7 -2
  12. data/lib/miga/cli/action/get_db.rb +16 -21
  13. data/lib/miga/cli/action/init.rb +41 -93
  14. data/lib/miga/cli/action/init/daemon_helper.rb +1 -2
  15. data/lib/miga/cli/action/init/files_helper.rb +118 -0
  16. data/lib/miga/cli/action/ncbi_get.rb +1 -1
  17. data/lib/miga/cli/action/new.rb +15 -9
  18. data/lib/miga/cli/action/option.rb +44 -0
  19. data/lib/miga/cli/action/quality_wf.rb +3 -3
  20. data/lib/miga/cli/action/tax_dist.rb +1 -1
  21. data/lib/miga/cli/action/tax_test.rb +1 -1
  22. data/lib/miga/cli/action/wf.rb +32 -30
  23. data/lib/miga/cli/base.rb +1 -0
  24. data/lib/miga/cli/objects_helper.rb +23 -18
  25. data/lib/miga/common.rb +4 -2
  26. data/lib/miga/common/net.rb +74 -0
  27. data/lib/miga/common/with_option.rb +83 -0
  28. data/lib/miga/common/with_result.rb +3 -2
  29. data/lib/miga/dataset/base.rb +20 -2
  30. data/lib/miga/dataset/result.rb +3 -2
  31. data/lib/miga/metadata.rb +25 -13
  32. data/lib/miga/project/base.rb +82 -2
  33. data/lib/miga/project/result.rb +4 -4
  34. data/lib/miga/remote_dataset.rb +2 -0
  35. data/lib/miga/result/stats.rb +2 -2
  36. data/lib/miga/version.rb +4 -2
  37. data/scripts/aai_distances.bash +1 -1
  38. data/scripts/ani_distances.bash +1 -1
  39. data/scripts/essential_genes.bash +1 -2
  40. data/scripts/haai_distances.bash +1 -1
  41. data/scripts/mytaxa.bash +6 -5
  42. data/scripts/mytaxa_scan.bash +8 -7
  43. data/scripts/ogs.bash +2 -3
  44. data/scripts/ssu.bash +16 -2
  45. data/test/dataset_test.rb +5 -5
  46. data/test/net_test.rb +34 -0
  47. data/test/with_option_test.rb +115 -0
  48. data/utils/cleanup-databases.rb +2 -3
  49. data/utils/distance/commands.rb +2 -2
  50. data/utils/distance/database.rb +1 -1
  51. data/utils/distance/pipeline.rb +2 -4
  52. data/utils/distance/runner.rb +15 -23
  53. data/utils/index_metadata.rb +1 -2
  54. data/utils/requirements.txt +6 -5
  55. data/utils/subclade/runner.rb +10 -11
  56. metadata +9 -3
@@ -0,0 +1,74 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'net/ftp'
4
+ require 'open-uri'
5
+ require 'fileutils'
6
+
7
+ Net::FTP.send(:remove_const, 'FTP_PORT') # just to avoid warnings
8
+ Net::FTP.const_set('FTP_PORT', 21)
9
+
10
+ ##
11
+ # General web-access functions shared throughout MiGA.
12
+ module MiGA::Common::Net
13
##
# Returns the URL (String) of the known host +name+ (Symbol, or anything
# responding to +to_sym+).
#
# Raises a RuntimeError if +name+ is not one of the recognized hosts.
def known_hosts(name)
  case name.to_sym
  when :miga_online_ftp
    'ftp://microbial-genomes.org//' # <- // to simplify chdir in connection
  when :miga_db
    'ftp://microbial-genomes.org/db'
  when :miga_dist
    'ftp://microbial-genomes.org/dist'
  else
    # Interpolate the parameter itself: referencing any other local here
    # would raise NameError and mask the intended error message
    raise "Unrecognized server name: #{name}"
  end
end
27
+
28
##
# Connect to an FTP +host+ (String) or a known host name (Symbol, see
# +.known_hosts+). The connection is set to passive mode, logged in with
# the defaults of Net::FTP#login, and moved to the directory given by the
# path of the URL.
#
# Returns the open Net::FTP connection; the caller is responsible for
# closing it. Raises a RuntimeError if the URL scheme is not +ftp+.
def remote_connection(host)
  host = known_hosts(host) if host.is_a?(Symbol)
  uri = URI.parse(host)
  raise 'Only FTP hosts are currently supported' unless uri.scheme == 'ftp'

  ftp = Net::FTP.new(uri.host)
  begin
    ftp.passive = true
    ftp.login
    ftp.chdir(uri.path)
  rescue StandardError
    # Do not leak the open socket when login or chdir fail
    ftp.close
    raise
  end
  ftp
end
42
+
43
##
# Download a file via FTP using the +connection+ (returned by
# +.remote_connection+) with remote name +file+ into local +target+.
# The directory containing +target+ is created if necessary.
#
# Alternatively, +connection+ can simply be the host (String) or a recognized
# Symbol (see +.remote_connection+), in which case the function opens the
# connection automatically and closes it at the end, even if the transfer
# fails. Connections passed by the caller are never closed here.
#
# Reports progress to the function block (if given) with two arguments: the
# currently transferred size and the total file size.
#
# Returns nil.
def download_file_ftp(connection, file, target)
  # Open connection unless passed
  close_conn = false
  if connection.is_a?(String) || connection.is_a?(Symbol)
    connection = remote_connection(connection)
    close_conn = true
  end

  # Prepare download
  FileUtils.mkdir_p(File.dirname(target))
  filesize = connection.size(file)
  transferred = 0

  # Get in chunks of 1KiB
  connection.getbinaryfile(file, target, 1024) do |data|
    yield(transferred += data.size, filesize) if block_given?
  end
  nil
ensure
  # Close connection if automatically opened, also when the transfer raised
  connection.close if close_conn
end
74
+ end
@@ -0,0 +1,83 @@
1
+ # frozen_string_literal: true
2
+
3
+ ##
4
+ # Helper module including specific functions to handle objects that
5
+ # have configurable options. The class including this module must implement
6
+ # the methods +.OPTIONS+, +#metadata+, and +#save+.
7
+ module MiGA::Common::WithOption
8
##
# Current value of the option +key+: the value stored in the metadata or,
# when that is nil, the declared default. Proc values are resolved by
# calling them with this object. Raises if +key+ is not a declared option.
def option(key)
  assert_has_option(key)
  current = option_by_metadata(key)
  current = option_by_default(key) if current.nil?
  current.is_a?(Proc) ? current.call(self) : current
end
15
+
16
##
# Validate +value+ for the option +key+ (parsing it first if +from_string+),
# store it in the metadata, save, and return the resulting option value.
def set_option(key, value, from_string = false)
  checked = assert_valid_option_value(key, value, from_string)
  metadata[key] = checked
  save
  option(key)
end
21
+
22
##
# Hash with every option declared by the class as key and its current
# value (see #option) as value.
def all_options
  self.class.OPTIONS.keys.each_with_object({}) do |name, values|
    values[name] = option(name)
  end
end
25
+
26
##
# Is +key+ an option declared by the class?
def option?(key)
  declared = self.class.OPTIONS[key.to_sym]
  !declared.nil?
end
29
+
30
##
# Value stored in the metadata under +key+, without falling back to defaults.
def option_by_metadata(key)
  stored = metadata[key]
  stored
end
33
+
34
##
# Declared default (the +:default+ entry) of the option +key+, or nil if
# the option declares none. Proc defaults are returned without being called.
def option_by_default(key)
  declared = self.class.OPTIONS[key.to_sym]
  declared[:default]
end
37
+
38
##
# Returns the declaration of the option +key+, or raises a RuntimeError if
# the class declares no such option.
def assert_has_option(key)
  declared = self.class.OPTIONS[key.to_sym]
  return declared unless declared.nil?

  raise "Unrecognized option: #{key}"
end
43
+
44
##
# Check that +value+ is acceptable for the option +key+ and return it. If
# +from_string+, the value is first parsed with #option_from_string.
#
# nil and any of the option's +:tokens+ are always accepted. Otherwise the
# value must be of the declared +:type+ (if any) and be included in the
# declared +:in+ collection (if any); a RuntimeError is raised if not.
def assert_valid_option_value(key, value, from_string = false)
  declared = assert_has_option(key)
  value = option_from_string(key, value) if from_string

  # nil is always valid, and so are supported tokens
  return value if value.nil?
  return value if declared[:tokens]&.include?(value)

  expected = declared[:type]
  if expected && !value.is_a?(expected)
    raise "Invalid value type for #{key}: #{value.class}, not #{expected}"
  end

  allowed = declared[:in]
  if allowed && !allowed.include?(value)
    raise "Value out of range for #{key}: #{value}, not #{allowed}"
  end

  value
end
61
+
62
##
# Parse the String +value+ into the kind of object expected by the option
# +key+:
# - an empty string or 'nil' become nil
# - any of the option's +:tokens+ is returned as is
# - Float and Integer options are converted with +to_f+ / +to_i+ after
#   checking that the string starts like a number (RuntimeError if not)
# - 'true' / 'false' become booleans if the option's +:in+ includes them
# - anything else is returned unchanged
#
# Raises a RuntimeError if +key+ is not a declared option.
def option_from_string(key, value)
  opt = assert_has_option(key)

  if ['', 'nil'].include?(value)
    nil
  elsif opt[:tokens]&.include?(value)
    value
  elsif opt[:type]&.equal?(Float)
    # \A (not ^) so that only the very start of the string can match: with ^
    # a later line starting like a number would pass and to_f would return 0.0
    raise "Not a float: #{value}" unless value =~ /\A-?\.?\d/
    value.to_f
  elsif opt[:type]&.equal?(Integer)
    # Same anchoring rationale as for floats
    raise "Not an integer: #{value}" unless value =~ /\A-?\d/
    value.to_i
  elsif opt[:in]&.include?(true) && value == 'true'
    true
  elsif opt[:in]&.include?(false) && value == 'false'
    false
  else
    value
  end
end
82
+ end
83
+ end
@@ -86,7 +86,8 @@ module MiGA::Common::WithResult
86
86
  if res.nil?
87
87
  # Run if the step has not been calculated,
88
88
  # unless too many attempts were already made
89
- if (metadata["_try_#{t}"] || 0) > (project.metadata[:max_try] || 10)
89
+ cur_try = metadata["_try_#{t}"] || 0
90
+ if cur_try > project.option(:max_try)
90
91
  inactivate! "Too many errors in step #{t}"
91
92
  false
92
93
  else
@@ -103,7 +104,7 @@ module MiGA::Common::WithResult
103
104
  ##
104
105
  # Mark all results for recalculation
105
106
  def recalculate_tasks(reason = nil)
106
- each_result { |res| res.recalculate!(reason).save }
107
+ each_result { |_k, res| res.recalculate!(reason).save }
107
108
  end
108
109
 
109
110
  end
@@ -1,7 +1,10 @@
1
- # @package MiGA
2
- # @license Artistic-2.0
1
+ # frozen_string_literal: true
2
+
3
+ require 'miga/common/with_option'
3
4
 
4
5
  class MiGA::Dataset < MiGA::MiGA
6
+ include MiGA::Common::WithOption
7
+
5
8
  # Class-level
6
9
  class << self
7
10
  def RESULT_DIRS
@@ -15,6 +18,10 @@ class MiGA::Dataset < MiGA::MiGA
15
18
  def PREPROCESSING_TASKS
16
19
  @@PREPROCESSING_TASKS
17
20
  end
21
+
22
+ def OPTIONS
23
+ @@OPTIONS
24
+ end
18
25
  end
19
26
  end
20
27
 
@@ -85,4 +92,15 @@ module MiGA::Dataset::Base
85
92
  # tasks are ignored for single-organism datasets or for unknown types.
86
93
  @@ONLY_MULTI_TASKS = [:mytaxa]
87
94
  @@_ONLY_MULTI_TASKS_H = Hash[@@ONLY_MULTI_TASKS.map { |i| [i, true] }]
95
+
96
+ ##
97
+ # Options supported by datasets
98
+ @@OPTIONS = {
99
+ db_project: {
100
+ desc: 'Project to use as database', type: String
101
+ },
102
+ dist_req: {
103
+ desc: 'Run distances against these datasets', type: Array, default: []
104
+ }
105
+ }
88
106
  end
@@ -50,7 +50,7 @@ module MiGA::Dataset::Result
50
50
  :upstream
51
51
  elsif !metadata["run_#{task}"].nil?
52
52
  metadata["run_#{task}"] ? :execute : :force
53
- elsif task == :taxonomy && project.metadata[:ref_project].nil?
53
+ elsif task == :taxonomy && project.option(:ref_project).nil?
54
54
  :project
55
55
  elsif @@_EXCLUDE_NOREF_TASKS_H[task] && !ref?
56
56
  :noref
@@ -290,7 +290,8 @@ module MiGA::Dataset::Result
290
290
  MiGA::Result.new("#{base}.json"), name,
291
291
  longest_ssu_gene: '.ssu.fa',
292
292
  gff: '.ssu.gff',
293
- all_ssu_genes: '.ssu.all.fa'
293
+ all_ssu_genes: '.ssu.all.fa',
294
+ classification: '.rdp.tsv'
294
295
  )
295
296
  opts[:is_clean] ||= false
296
297
  r.clean! if opts[:is_clean]
data/lib/miga/metadata.rb CHANGED
@@ -56,24 +56,20 @@ class MiGA::Metadata < MiGA::MiGA
56
56
  ##
57
57
  # Save the metadata into #path
58
58
  def save
59
- MiGA.DEBUG "Metadata.save #{path}"
59
+ return if self[:never_save]
60
+
61
+ MiGA::MiGA.DEBUG "Metadata.save #{path}"
60
62
  self[:updated] = Time.now.to_s
61
63
  json = to_json
62
- sleeper = 0.0
63
- slept = 0
64
- while File.exist?(lock_file)
65
- MiGA::MiGA.DEBUG "Waiting for lock: #{lock_file}"
66
- sleeper += 0.1 if sleeper <= 10.0
67
- sleep(sleeper.to_i)
68
- slept += sleeper.to_i
69
- raise "Lock detected for over 10 minutes: #{lock_file}" if slept > 600
70
- end
71
- FileUtils.touch lock_file
64
+ wait_for_lock
65
+ FileUtils.touch(lock_file)
72
66
  ofh = File.open("#{path}.tmp", 'w')
73
67
  ofh.puts json
74
68
  ofh.close
75
- raise "Lock-racing detected for #{path}" unless
76
- File.exist?("#{path}.tmp") and File.exist?(lock_file)
69
+
70
+ unless File.exist?("#{path}.tmp") && File.exist?(lock_file)
71
+ raise "Lock-racing detected for #{path}"
72
+ end
77
73
 
78
74
  File.rename("#{path}.tmp", path)
79
75
  File.unlink(lock_file)
@@ -154,4 +150,20 @@ class MiGA::Metadata < MiGA::MiGA
154
150
  def to_json
155
151
  MiGA::Json.generate(data)
156
152
  end
153
+
154
+ private
155
+
156
##
# Block until the lock file no longer exists. The pause between checks grows
# by 0.1 seconds per check until it exceeds 10 seconds; raises a RuntimeError
# once the accumulated pauses exceed 600 seconds.
def wait_for_lock
  pause = 0.0
  waited = 0.0
  loop do
    break unless File.exist?(lock_file)

    MiGA::MiGA.DEBUG "Waiting for lock: #{lock_file}"
    pause += 0.1 if pause <= 10.0
    sleep(pause)
    waited += pause
    raise "Lock detected for over 10 minutes: #{lock_file}" if waited > 600
  end
end
157
169
  end
@@ -1,7 +1,10 @@
1
- # @package MiGA
2
- # @license Artistic-2.0
1
+ # frozen_string_literal: true
2
+
3
+ require 'miga/common/with_option'
3
4
 
4
5
  class MiGA::Project < MiGA::MiGA
6
+ include MiGA::Common::WithOption
7
+
5
8
  class << self
6
9
  ##
7
10
  # Does the project at +path+ exist?
@@ -33,6 +36,10 @@ class MiGA::Project < MiGA::MiGA
33
36
  def RESULT_DIRS
34
37
  @@RESULT_DIRS
35
38
  end
39
+
40
+ def OPTIONS
41
+ @@OPTIONS
42
+ end
36
43
  end
37
44
  end
38
45
 
@@ -108,4 +115,77 @@ module MiGA::Project::Base
108
115
  ##
109
116
  # Project-wide tasks for :clade projects
110
117
  @@INCLADE_TASKS = [:subclades, :ogs]
118
+
119
+ ##
120
+ # Options supported by projects
121
+ @@OPTIONS = {
122
+ ref_project: {
123
+ desc: 'Project with reference taxonomy', type: String
124
+ },
125
+ db_proj_dir: {
126
+ desc: 'Directory containing database projects', type: String
127
+ },
128
+ tax_pvalue: {
129
+ desc: 'Maximum p-value to transfer taxonomy', default: 0.05, type: Float,
130
+ in: 0.0..1.0
131
+ },
132
+ haai_p: {
133
+ desc: 'Value of aai.rb -p on hAAI', type: String,
134
+ default: proc { |project| project.clade? ? 'no' : 'blast+' },
135
+ in: %w[blast+ blast blat diamond no]
136
+ },
137
+ aai_p: {
138
+ desc: 'Value of aai.rb -p on AAI', default: 'blast+', type: String,
139
+ in: %w[blast+ blast blat diamond]
140
+ },
141
+ ani_p: {
142
+ desc: 'Value of ani.rb -p on ANI', default: 'blast+', type: String,
143
+ in: %w[blast+ blast blat fastani]
144
+ },
145
+ max_try: {
146
+ desc: 'Maximum number of task attempts', default: 10, type: Integer,
147
+ in: (0..1000)
148
+ },
149
+ aai_save_rbm: {
150
+ desc: 'Should RBMs be saved for OGS analysis?',
151
+ default: proc { |project| project.clade? },
152
+ in: [true, false]
153
+ },
154
+ ogs_identity: {
155
+ desc: 'Min RBM identity for OGS', default: 80.0, type: Float,
156
+ in: (0.0..100.0)
157
+ },
158
+ clean_ogs: {
159
+ desc: 'If false, keeps ABC files (clades only)', default: true,
160
+ in: [true, false]
161
+ },
162
+ run_clades: {
163
+ desc: 'Should clades be estimated from distances?', default: true,
164
+ in: [true, false]
165
+ },
166
+ gsp_ani: {
167
+ desc: 'ANI limit to propose gsp clades', default: 95.0, type: Float,
168
+ in: (0.0..100.0)
169
+ },
170
+ gsp_aai: {
171
+ desc: 'AAI limit to propose gsp clades', default: 90.0, type: Float,
172
+ in: (0.0..100.0)
173
+ },
174
+ gsp_metric: {
175
+ desc: 'Metric to propose clades', default: 'ani', type: String,
176
+ in: %w[ani aai]
177
+ },
178
+ ess_coll: {
179
+ desc: 'Collection of essential genes to use', default: 'dupont_2012',
180
+ type: String, in: %w[dupont_2012 lee_2019]
181
+ },
182
+ min_qual: {
183
+ desc: 'Minimum genome quality', default: 25.0, type: Float,
184
+ in: -Float::INFINITY..100.0, tokens: %w[no]
185
+ },
186
+ distances_checkpoint: {
187
+ desc: 'Number of comparisons before storing data', default: 10,
188
+ type: Integer, in: 1...Float::INFINITY
189
+ }
190
+ }
111
191
  end
@@ -31,9 +31,9 @@ module MiGA::Project::Result
31
31
  ##
32
32
  # Is this +task+ to be bypassed?
33
33
  def ignore_task?(task)
34
- metadata["run_#{task}"] == false ||
35
- (!is_clade? && @@INCLADE_TASKS.include?(task) &&
36
- metadata["run_#{task}"] != true)
34
+ return true if metadata["run_#{task}"] == false
35
+
36
+ !clade? && @@INCLADE_TASKS.include?(task) && metadata["run_#{task}"] != true
37
37
  end
38
38
 
39
39
  ##
@@ -74,7 +74,7 @@ module MiGA::Project::Result
74
74
  return r
75
75
  end
76
76
  return nil unless result_files_exist?(base, %w[.proposed-clades])
77
- unless is_clade? ||
77
+ unless clade? ||
78
78
  result_files_exist?(
79
79
  base, %w[.pdf .classif .medoids .class.tsv .class.nwk]
80
80
  )
@@ -162,6 +162,8 @@ class MiGA::RemoteDataset < MiGA::MiGA
162
162
  txt.empty? ? sleep(1) : break
163
163
  end
164
164
  doc = MiGA::Json.parse(txt, symbolize: false, contents: true)
165
+ return if doc.nil? || doc['result'].nil? || doc['result'].empty?
166
+
165
167
  @_ncbi_asm_json_doc = doc['result'][ doc['result']['uids'].first ]
166
168
  end
167
169
 
@@ -118,7 +118,7 @@ module MiGA::Result::Stats
118
118
 
119
119
  def compute_stats_essential_genes
120
120
  stats = {}
121
- if source.is_multi?
121
+ if source.multi?
122
122
  stats = { median_copies: 0, mean_copies: 0 }
123
123
  File.open(file_path(:report), 'r') do |fh|
124
124
  fh.each_line do |ln|
@@ -151,7 +151,7 @@ module MiGA::Result::Stats
151
151
  source.save
152
152
 
153
153
  # Inactivate low-quality datasets
154
- min_qual = (project.metadata[:min_qual] || 25)
154
+ min_qual = project.option(:min_qual)
155
155
  if min_qual != 'no' && stats[:quality] < min_qual
156
156
  source.inactivate! 'Low quality genome'
157
157
  end
data/lib/miga/version.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'date'
2
4
 
3
5
  ##
@@ -8,7 +10,7 @@ module MiGA
8
10
  # - Float representing the major.minor version.
9
11
  # - Integer representing gem releases of the current version.
10
12
  # - Integer representing minor changes that require new version number.
11
- VERSION = [0.7, 21, 0]
13
+ VERSION = [0.7, 24, 0].freeze
12
14
 
13
15
  ##
14
16
  # Nickname for the current major.minor version.
@@ -16,7 +18,7 @@ module MiGA
16
18
 
17
19
  ##
18
20
  # Date of the current gem release.
19
- VERSION_DATE = Date.new(2021, 1, 13)
21
+ VERSION_DATE = Date.new(2021, 2, 16)
20
22
 
21
23
  ##
22
24
  # Reference of MiGA.