miga-base 0.7.24.0 → 0.7.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,4 +2,13 @@ require 'miga'
2
2
  require 'miga/tax_dist'
3
3
 
4
4
  class MiGA::DistanceRunner
5
+ require_relative 'temporal.rb'
6
+ require_relative 'database.rb'
7
+ require_relative 'commands.rb'
8
+ require_relative 'pipeline.rb'
9
+
10
+ include MiGA::DistanceRunner::Temporal
11
+ include MiGA::DistanceRunner::Database
12
+ include MiGA::DistanceRunner::Commands
13
+ include MiGA::DistanceRunner::Pipeline
5
14
  end
@@ -1,105 +1,207 @@
1
1
  module MiGA::DistanceRunner::Commands
2
- # Estimates or calculates AAI against +target+
3
- def aai(target)
4
- # Check if the request makes sense
5
- return nil if target.nil? || target.result(:essential_genes).nil?
6
-
7
- # Check if it's been calculated
8
- y = stored_value(target, :aai)
9
- return y unless y.nil? || y.zero?
10
-
11
- # Try hAAI (except in clade projects)
12
- unless @ref_project.clade?
13
- y = haai(target)
14
- return y unless y.nil? || y.zero?
2
+ ##
3
+ # Estimates AAI against +targets+ using hAAI
4
+ def haai(targets)
5
+ puts "[#{Time.now}] hAAI: #{dataset.name} vs #{targets.size} targets"
6
+ empty_vals = targets.map { |_i| nil }
7
+ return empty_vals if opts[:haai_p] == 'no'
8
+
9
+ # Launch comparisons
10
+ sbj = pending_targets(targets, :haai)
11
+ unless sbj.empty?
12
+ opts[:haai_p] == 'fastaai' ? fastaai_cmd(sbj) : haai_cmd(sbj)
15
13
  end
16
- # Full AAI
17
- aai_cmd(
18
- tmp_file('proteins.fa'), target.result(:cds).file_path(:proteins),
19
- dataset.name, target.name, tmp_dbs[:aai]
20
- ).tap { checkpoint :aai }
14
+
15
+ # Return AAI estimates from the database
16
+ batch_values_from_db(:aai, targets.map { |i| i&.name })
21
17
  end
22
18
 
23
19
  ##
24
- # Estimates AAI against +target+ using hAAI
25
- def haai(target)
26
- return nil if opts[:haai_p] == 'no'
27
-
28
- haai = aai_cmd(tmp_file('ess_genes.fa'),
29
- target.result(:essential_genes).file_path(:ess_genes),
30
- dataset.name, target.name, tmp_dbs[:haai],
31
- aai_save_rbm: 'no-save-rbm', aai_p: opts[:haai_p])
32
- checkpoint :haai
33
- return nil if haai.nil? || haai.zero? || haai > 90.0
34
-
35
- aai = 100.0 - Math.exp(2.435076 + 0.4275193 * Math.log(100.0 - haai))
36
- SQLite3::Database.new(tmp_dbs[:aai]) do |conn|
37
- conn.execute 'insert into aai values(?, ?, ?, 0, 0, 0)',
38
- [dataset.name, target.name, aai]
20
+ # Estimates or calculates AAI against +targets+
21
+ def aai(targets)
22
+ puts "[#{Time.now}] AAI: #{dataset.name} vs #{targets.size} targets"
23
+
24
+ # Try hAAI first
25
+ haai(targets)
26
+
27
+ # Launch comparisons
28
+ pending_targets(targets, :aai).each do |target|
29
+ # Full AAI
30
+ target_cds = target.result(:cds).file_path(:proteins) or next
31
+ aairb_cmd(
32
+ tmp_file('proteins.fa'), target_cds,
33
+ dataset.name, target.name, tmp_dbs[:aai], checkpoint: :aai
34
+ )
39
35
  end
40
- checkpoint :aai
41
- aai
36
+
37
+ # Return AAI from the database
38
+ batch_values_from_db(:aai, targets.map { |i| i&.name })
42
39
  end
43
40
 
44
41
  ##
45
- # Calculates ANI against +target+
46
- def ani(target)
47
- # Check if the request makes sense
48
- t = tmp_file('largecontigs.fa')
49
- r = target.result(:assembly)
50
- return nil if r.nil? || !File.size?(t)
51
-
52
- # Check if it's been calculated
53
- y = stored_value(target, :ani)
54
- return y unless y.nil? || y.zero?
55
-
56
- # Run it
57
- ani_cmd(
58
- t, r.file_path(:largecontigs),
59
- dataset.name, target.name, tmp_dbs[:ani]
60
- ).tap { checkpoint :ani }
42
+ # Calculates ANI against +targets+
43
+ def ani(targets)
44
+ puts "[#{Time.now}] ANI: #{dataset.name} vs #{targets.size} targets"
45
+ empty_vals = targets.map { |_i| nil }
46
+ return empty_vals unless File.size?(tmp_file('largecontigs.fa'))
47
+
48
+ # Launch comparisons
49
+ sbj = pending_targets(targets, :ani)
50
+ unless sbj.empty?
51
+ opts[:ani_p] == 'fastani' ? fastani_cmd(sbj) : anirb_cmd(sbj)
52
+ end
53
+
54
+ # Return ANI from the database
55
+ batch_values_from_db(:ani, targets.map { |i| i&.name })
61
56
  end
62
57
 
63
58
  ##
64
- # Calculates and returns ANI against +target+ if AAI >= +aai_limit+.
65
- # Returns +nil+ otherwise
66
- def ani_after_aai(target, aai_limit = 85.0)
67
- aai = aai(target)
68
- (aai.nil? || aai < aai_limit) ? nil : ani(target)
59
+ # Calculates and returns ANI against +targets+ if AAI >= +aai_limit+.
60
+ # Note that ANI values may be returned for lower (or failing) AAIs if the
61
+ # value is already stored in the database
62
+ def ani_after_aai(targets, aai_limit = 85.0)
63
+ # Run AAI and select targets with AAI ≥ aai_limit
64
+ aai = aai(targets)
65
+ sbj = aai.each_with_index.map { |i, k| targets[k] if i&.>= aai_limit }
66
+ sbj.compact!
67
+
68
+ # Run ANI
69
+ ani(sbj) unless sbj.empty?
70
+
71
+ # Return ANI from the database
72
+ batch_values_from_db(:ani, targets.map { |i| i&.name })
69
73
  end
70
74
 
71
75
  ##
72
76
  # Execute an AAI command
73
- def aai_cmd(f1, f2, n1, n2, db, o = {})
77
+ def aairb_cmd(f1, f2, n1, n2, db, o = {})
74
78
  o = opts.merge(o)
75
- v = `aai.rb -1 "#{f1}" -2 "#{f2}" -S "#{db}" \
76
- --name1 "#{n1}" --name2 "#{n2}" \
77
- -t "#{o[:thr]}" -a --lookup-first "--#{o[:aai_save_rbm]}" \
78
- -p "#{o[:aai_p]}"`.chomp
79
- (v.nil? || v.empty?) ? 0 : v.to_f
79
+ run_cmd <<~CMD
80
+ aai.rb -1 "#{f1}" -2 "#{f2}" -S "#{db}" \
81
+ --name1 "#{n1}" --name2 "#{n2}" \
82
+ -t "#{o[:thr]}" -a --#{'no-' unless o[:aai_save_rbm]}save-rbm \
83
+ -p "#{o[:aai_p]}"
84
+ CMD
85
+ ensure
86
+ checkpoint(o[:checkpoint]) if o[:checkpoint]
80
87
  end
81
88
 
82
89
  ##
83
- # Execute an ANI command
84
- def ani_cmd(f1, f2, n1, n2, db, o = {})
85
- o = opts.merge(o)
86
- v = nil
87
- if o[:ani_p] == 'fastani'
88
- out = `fastANI -r "#{f1}" -q "#{f2}" \
89
- -o /dev/stdout 2>/dev/null`.chomp.split(/\s+/)
90
- unless out.empty?
91
- SQLite3::Database.new(db) do |conn|
92
- conn.execute 'insert into ani values(?, ?, ?, 0, ?, ?)',
93
- [n1, n2, out[2], out[3], out[4]]
94
- end
90
+ # Execute an ani.rb command
91
+ def anirb_cmd(targets)
92
+ f1 = tmp_file('largecontigs.fa')
93
+ return unless File.size?(f1)
94
+
95
+ targets.each do |target|
96
+ target_asm = target&.result(:assembly)&.file_path(:largecontigs) or next
97
+ run_cmd <<~CMD
98
+ ani.rb -1 "#{f1}" -2 "#{target_asm}" -S "#{tmp_dbs[:ani]}" \
99
+ --name1 "#{dataset.name}" --name2 "#{target.name}" \
100
+ -t "#{opts[:thr]}" -a --no-save-regions --no-save-rbm \
101
+ -p "#{opts[:ani_p]}"
102
+ CMD
103
+ checkpoint(:ani)
104
+ end
105
+ end
106
+
107
+ ##
108
+ # Execute a FastANI command
109
+ def fastani_cmd(targets)
110
+ f1 = tmp_file('largecontigs.fa')
111
+ return unless File.size?(f1)
112
+
113
+ # Run FastANI
114
+ File.open(f2 = tmp_file, 'w') do |fh|
115
+ targets.each do |target|
116
+ target_asm = target&.result(:assembly)&.file_path(:largecontigs)
117
+ fh.puts target_asm if target_asm
118
+ end
119
+ end
120
+ run_cmd <<~CMD
121
+ fastANI -q "#{f1}" --rl "#{f2}" -t #{opts[:thr]} \
122
+ -o "#{f3 = tmp_file}"
123
+ CMD
124
+
125
+ # Retrieve resulting data and save to DB
126
+ data = {}
127
+ File.open(f3, 'r') do |fh|
128
+ fh.each do |ln|
129
+ row = ln.chomp.split("\t")
130
+ n2 = File.basename(row[1], '.gz')
131
+ n2 = File.basename(n2, '.LargeContigs.fna')
132
+ data[n2] = [row[2].to_f, 0.0, row[3].to_i, row[4].to_i]
133
+ end
134
+ end
135
+ batch_data_to_db(:ani, data)
136
+
137
+ # Cleanup
138
+ [f2, f3].each { |i| File.unlink(i) }
139
+ end
140
+
141
+ ##
142
+ # Execute a FastAAI command
143
+ def fastaai_cmd(targets)
144
+ qry_idx = dataset.result(:essential_genes).file_path(:fastaai_index)
145
+ return nil unless qry_idx
146
+
147
+ # Run FastAAI
148
+ File.open(f1 = tmp_file, 'w') { |fh| fh.puts qry_idx }
149
+ File.open(f2 = tmp_file, 'w') do |fh|
150
+ targets.each do |target|
151
+ target_idx = target&.result(:essential_genes)&.file_path(:fastaai_index)
152
+ fh.puts target_idx if target_idx
153
+ end
154
+ end
155
+ run_cmd <<~CMD
156
+ FastAAI --qd "#{f1}" --rd "#{f2}" --output "#{f3 = tmp_file}" \
157
+ --threads #{opts[:thr]}
158
+ CMD
159
+
160
+ # Save values in the databases
161
+ haai_data = {}
162
+ aai_data = {}
163
+ File.open(f3, 'r') do |fh|
164
+ fh.each do |ln|
165
+ out = ln.chomp.split("\t")
166
+ haai_data[out[1]] = [
167
+ out[2].to_f * 100, out[3].to_f * 100, out[4].to_i, out[5].to_i
168
+ ]
169
+ aai_data[out[1]] = [out[6].to_f, 0, 0, 0] if out[6] !~ /^>/
95
170
  end
96
- v = out[2]
97
- else
98
- v = `ani.rb -1 "#{f1}" -2 "#{f2}" -S "#{db}" \
99
- --name1 "#{n1}" --name2 "#{n2}" \
100
- -t "#{opts[:thr]}" -a --no-save-regions --no-save-rbm \
101
- --lookup-first -p "#{o[:ani_p] || 'blast+'}"`.chomp
102
171
  end
103
- v.nil? || v.empty? ? 0 : v.to_f
172
+ batch_data_to_db(:haai, haai_data)
173
+ batch_data_to_db(:aai, aai_data)
174
+
175
+ # Cleanup
176
+ [f1, f2, f3].each { |i| File.unlink(i) }
177
+ end
178
+
179
+ ##
180
+ # Execute an hAAI command
181
+ def haai_cmd(targets)
182
+ aai_data = {}
183
+ targets.each do |target|
184
+ target_ess = target&.result(:essential_genes)&.file_path(:ess_genes)
185
+ next unless target_ess
186
+
187
+ # hAAI
188
+ h = aairb_cmd(
189
+ tmp_file('ess_genes.fa'), target_ess,
190
+ dataset.name, target.name, tmp_dbs[:haai],
191
+ aai_save_rbm: false, aai_p: opts[:haai_p], checkpoint: :haai
192
+ )&.chomp&.to_f
193
+ next if h.nil? || h.zero? || h > 90.0
194
+
195
+ # Estimated AAI
196
+ aai_data[target.name] = [
197
+ 100.0 - Math.exp(2.435076 + 0.4275193 * Math.log(100.0 - h)), 0, 0, 0
198
+ ] unless h&.zero? || h > 90.0
199
+ end
200
+ batch_data_to_db(:aai, aai_data)
201
+ end
202
+
203
+ def run_cmd(cmd)
204
+ puts "CMD: #{cmd}"
205
+ `#{cmd}`
104
206
  end
105
207
  end
@@ -22,12 +22,16 @@ module MiGA::DistanceRunner::Database
22
22
  end
23
23
  end
24
24
  # Initialize if it doesn't exist
25
- SQLite3::Database.new(dbs[m]) do |conn|
26
- conn.execute "create table if not exists #{t}(" +
27
- "seq1 varchar(256), seq2 varchar(256), " +
28
- "#{t} float, sd float, n int, omega int" +
29
- ")"
30
- end unless File.size? dbs[m]
25
+ unless File.size? dbs[m]
26
+ SQLite3::Database.new(dbs[m]) do |conn|
27
+ conn.execute <<~SQL
28
+ create table if not exists #{t}(
29
+ seq1 varchar(256), seq2 varchar(256),
30
+ #{t} float, sd float, n int, omega int
31
+ )
32
+ SQL
33
+ end
34
+ end
31
35
  # Copy over to (local) temporals
32
36
  @tmp_dbs[m] = tmp_file("#{m}.db")
33
37
  FileUtils.cp(dbs[m], tmp_dbs[m])
@@ -92,27 +96,72 @@ module MiGA::DistanceRunner::Database
92
96
  # possible number of matches
93
97
  def data_from_db(n1, n2, db, metric)
94
98
  y = nil
99
+ table = metric == :haai ? :aai : metric
95
100
  SQLite3::Database.new(db) do |conn|
96
101
  y = conn.execute(
97
- "select #{metric}, sd, n, omega from #{metric} where seq1=? and seq2=?",
102
+ "select #{table}, sd, n, omega from #{table} where seq1=? and seq2=?",
98
103
  [n1, n2]
99
104
  ).first
100
- end if File.size? db
105
+ end if File.size?(db)
101
106
  y
102
107
  end
103
108
 
104
109
  ##
105
110
  # Save +data+ of +metric+ between +n1+ and +n2+ in the +db+ database.
106
111
  def data_to_db(n1, n2, db, metric, data)
112
+ table = metric == :haai ? :aai : metric
107
113
  SQLite3::Database.new(db) do |conn|
108
114
  conn.execute(
109
- "insert into #{metric} (seq1, seq2, #{metric}, sd, n, omega) " +
115
+ "insert into #{table} (seq1, seq2, #{table}, sd, n, omega) " +
110
116
  "values (?, ?, ?, ?, ?, ?)", [n1, n2] + data
111
117
  )
112
118
  end
113
119
  checkpoint metric
114
120
  end
115
121
 
122
+ ##
123
+ # Saves +data+ of +metric+ in batch to the temporary database,
124
+ # and assumes query is +#dataset+. +data+ must be a hash with target names
125
+ # as key and arrays as values with: [val, sd, n, omega]
126
+ def batch_data_to_db(metric, data)
127
+ db = tmp_dbs[metric]
128
+ table = metric == :haai ? :aai : metric
130
+ SQLite3::Database.new(db) do |conn|
131
+ data.each do |k, v|
132
+ sql = <<~SQL
133
+ insert into #{table} (
134
+ seq1, seq2, #{table}, sd, n, omega
135
+ ) values (?, ?, ?, ?, ?, ?)
136
+ SQL
137
+ conn.execute(sql, [dataset.name, k] + v)
138
+ end
139
+ end
140
+ checkpoint(metric)
141
+ end
142
+
143
+ ##
144
+ # Retrieves data of +metric+ in batch from the temporary database,
145
+ # and assumes query is +#dataset+. The output data is a hash with the same
146
+ # structure described for +#batch_data_to_db+
147
+ def batch_data_from_db(metric)
148
+ db = tmp_dbs[metric]
149
+ table = metric == :haai ? :aai : metric
150
+ data = {}
151
+ SQLite3::Database.new(db) do |conn|
152
+ sql = "select seq2, #{table}, sd, n, omega from #{table}"
153
+ conn.execute(sql).each { |row| data[row.shift] = row }
154
+ end
155
+ data
156
+ end
157
+
158
+ ##
159
+ # Retrieve only +metric+ values against +names+
160
+ def batch_values_from_db(metric, names)
161
+ data = batch_data_from_db(metric)
162
+ names.map { |i| data[i]&.first }
163
+ end
164
+
116
165
  ##
117
166
  # Iterates for each entry in +db+
118
167
  def foreach_in_db(db, metric, &blk)
@@ -120,4 +169,14 @@ module MiGA::DistanceRunner::Database
120
169
  conn.execute("select * from #{metric}").each { |r| blk[r] }
121
170
  end
122
171
  end
172
+
173
+ ##
174
+ # Select only those targets that are not yet stored in either direction
175
+ def pending_targets(targets, metric)
176
+ saved = batch_data_from_db(metric).keys
177
+ targets
178
+ .compact
179
+ .select { |i| !saved.include?(i.name) }
180
+ .select { |i| !stored_value(i, metric)&.> 0.0 }
181
+ end
123
182
  end
@@ -11,19 +11,14 @@ module MiGA::DistanceRunner::Pipeline
11
11
  val_med = ''
12
12
  val_cls = nil
13
13
  i_n = 0
14
- File.open(med, 'r') do |med_fh|
15
- med_fh.each_line do |med_ln|
16
- i_n += 1
17
- med_ln.chomp!
18
- val = send(metric, ref_project.dataset(med_ln))
19
- if !val.nil? and val >= max_val
20
- max_val = val
21
- val_med = med_ln
22
- val_cls = i_n
23
- puts "[#{classif}] New max: #{val_med} (#{val_cls}): #{max_val}"
24
- end
25
- end
26
- end
14
+ sbj_datasets = File.foreach(med).map { |i| ref_project.dataset(i.chomp) }
15
+ values = send(metric, sbj_datasets)
16
+ max_idx = values.map(&:to_f).each_with_index.max[1]
17
+ max_val = values[max_idx]
18
+ val_med = sbj_datasets[max_idx].name
19
+ val_cls = max_idx + 1
20
+ puts "[#{classif}] New max: #{val_med} (#{val_cls}): #{max_val}"
21
+
27
22
  classif = "#{classif}/miga-project.sc-#{val_cls}"
28
23
  result_fh.puts [val_cls, val_med, max_val, classif].join("\t")
29
24
  classify(clades, classif, metric, result_fh, val_cls)
@@ -32,9 +27,8 @@ module MiGA::DistanceRunner::Pipeline
32
27
  # Run distances against datasets listed in metadata's +:dist_req+
33
28
  def distances_by_request(metric)
34
29
  $stderr.puts 'Running distances by request'
35
- dataset.option(:dist_req).each do |target|
36
- ds = ref_project.dataset(target) and send(metric, ds)
37
- end
30
+ sbj_datasets = dataset.option(:dist_req).map { |i| ref_project.dataset(i) }
31
+ send(metric, sbj_datasets)
38
32
  end
39
33
 
40
34
  # Builds a tree with all visited medoids from any classification level
@@ -74,8 +68,10 @@ module MiGA::DistanceRunner::Pipeline
74
68
  $stderr.puts "Testing taxonomy | opts = #{opts}"
75
69
  # Get taxonomy of closest relative
76
70
  from_ref_project = (project != ref_project)
77
- res_dir = from_ref_project ?
78
- File.expand_path('data/09.distances/05.taxonomy', project.path) : home
71
+ res_dir =
72
+ from_ref_project ?
73
+ File.expand_path('data/09.distances/05.taxonomy', project.path) :
74
+ home
79
75
  Dir.mkdir res_dir unless Dir.exist? res_dir
80
76
  File.open(File.expand_path("#{dataset.name}.done", res_dir), 'w') do |fh|
81
77
  fh.puts Time.now.to_s