miga-base 1.2.14.2 → 1.2.15.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/miga/cli/action/doctor/base.rb +16 -40
- data/lib/miga/cli/action/doctor/databases.rb +39 -0
- data/lib/miga/cli/action/doctor/distances.rb +144 -0
- data/lib/miga/cli/action/doctor/operations.rb +159 -0
- data/lib/miga/cli/action/doctor.rb +7 -287
- data/lib/miga/cli/action/download/base.rb +64 -2
- data/lib/miga/cli/action/gtdb_get.rb +2 -31
- data/lib/miga/cli/action/ncbi_get.rb +6 -31
- data/lib/miga/cli/opt_helper.rb +1 -1
- data/lib/miga/common/errors.rb +10 -0
- data/lib/miga/dataset/base.rb +34 -5
- data/lib/miga/dataset/result/add.rb +286 -0
- data/lib/miga/dataset/result/ignore.rb +93 -0
- data/lib/miga/dataset/result.rb +31 -342
- data/lib/miga/remote_dataset/download.rb +6 -0
- data/lib/miga/version.rb +2 -2
- data/test/remote_dataset_test.rb +6 -0
- metadata +7 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0ad52d75f0a01d681043530335e20e89ecfb46822a8166dd7c0993b9822882f9
|
4
|
+
data.tar.gz: 28bf47b3e257718cb3071ebf48d2f0fd63a8797116f8108faa789643940aac3f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b498f1617e597feb77f6adc655f7fdd0ee941b670867f5157f12f5af49a462c712a72ea60a7b6ef28ad702a4afdcca0a33bac00216d306ef4bff2ddcee4f844b
|
7
|
+
data.tar.gz: 41755ac99346b7dbbdf1a971158f98c99ca970d4134b912b17989d4776ae528902fed6df68d7f477fecfcc02657fae0350f311e3195c2211bc4e0945744366c8
|
@@ -1,47 +1,7 @@
|
|
1
1
|
require 'miga/cli/action'
|
2
2
|
require 'miga/sqlite'
|
3
3
|
|
4
|
-
class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
|
5
|
-
end
|
6
|
-
|
7
4
|
module MiGA::Cli::Action::Doctor::Base
|
8
|
-
##
|
9
|
-
# Check the database in +db_file+ maintains integrity for the
|
10
|
-
# tables saving +metric+ (:ani or :aai) and call +blk+ if the
|
11
|
-
# file is corrupt or doesn't contain the expected structure
|
12
|
-
def check_sqlite3_database(db_file, metric, &blk)
|
13
|
-
MiGA::SQLite.new(db_file).run("select count(*) from #{metric}")
|
14
|
-
rescue SQLite3::SQLException, SQLite3::CorruptException
|
15
|
-
blk.call
|
16
|
-
end
|
17
|
-
|
18
|
-
def each_database_file(dataset, &blk)
|
19
|
-
ref_db = {
|
20
|
-
haai: ['01.haai', :aai], aai: ['02.aai', :aai], ani: ['03.ani', :ani]
|
21
|
-
}
|
22
|
-
qry_db = {
|
23
|
-
haai: ['.haai.db', :aai], aai: ['.aai.db', :aai], ani: ['.ani.db', :ani]
|
24
|
-
}
|
25
|
-
base = File.join(dataset.project.path, 'data', '09.distances')
|
26
|
-
result = :distances
|
27
|
-
if dataset.ref?
|
28
|
-
file_db = "#{dataset.name}.db"
|
29
|
-
ref_db.each do |rank, v|
|
30
|
-
dir, metric = *v
|
31
|
-
file = File.join(base, dir, file_db)
|
32
|
-
blk[file, metric, result, rank] if File.exist? file
|
33
|
-
end
|
34
|
-
# Query databases for reference databases refer to taxonomy runs
|
35
|
-
base = File.join(base, '05.taxonomy')
|
36
|
-
result = :taxonomy
|
37
|
-
end
|
38
|
-
qry_db.each do |rank, v|
|
39
|
-
ext, metric = *v
|
40
|
-
file = File.join(base, "#{dataset.name}#{ext}")
|
41
|
-
blk[file, metric, result, rank] if File.exist? file
|
42
|
-
end
|
43
|
-
end
|
44
|
-
|
45
5
|
##
|
46
6
|
# Scans the all-vs-all matrix registered in +res+ (MiGA::Result) in search of
|
47
7
|
# pairs where one or both datasets are missing or inactive in the project +p+
|
@@ -157,4 +117,20 @@ module MiGA::Cli::Action::Doctor::Base
|
|
157
117
|
end
|
158
118
|
end
|
159
119
|
end
|
120
|
+
|
121
|
+
##
|
122
|
+
# Run command +cmd+ with options +opts+
|
123
|
+
def run_cmd(cmd, opts = {})
|
124
|
+
opts = { return: :output, err2out: true, raise: false }.merge(opts)
|
125
|
+
cmdo = MiGA::MiGA.run_cmd(cmd, opts).chomp
|
126
|
+
warn(cmdo) unless cmdo.empty?
|
127
|
+
end
|
128
|
+
|
129
|
+
##
|
130
|
+
# Check if the essential genes result +res+ has an outdated FastAAI index
|
131
|
+
def outdated_fastaai_ess(res)
|
132
|
+
idx1 = res.file_path(:fastaai_index)
|
133
|
+
idx2 = res.file_path(:fastaai_index_2)
|
134
|
+
idx2.nil? && !idx1.nil?
|
135
|
+
end
|
160
136
|
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
|
2
|
+
module MiGA::Cli::Action::Doctor::Databases
|
3
|
+
##
|
4
|
+
# Check the database in +db_file+ maintains integrity for the
|
5
|
+
# tables saving +metric+ (:ani or :aai) and call +blk+ if the
|
6
|
+
# file is corrupt or doesn't contain the expected structure
|
7
|
+
def check_sqlite3_database(db_file, metric, &blk)
|
8
|
+
MiGA::SQLite.new(db_file).run("select count(*) from #{metric}")
|
9
|
+
rescue SQLite3::SQLException, SQLite3::CorruptException
|
10
|
+
blk.call
|
11
|
+
end
|
12
|
+
|
13
|
+
def each_database_file(dataset, &blk)
|
14
|
+
ref_db = {
|
15
|
+
haai: ['01.haai', :aai], aai: ['02.aai', :aai], ani: ['03.ani', :ani]
|
16
|
+
}
|
17
|
+
qry_db = {
|
18
|
+
haai: ['.haai.db', :aai], aai: ['.aai.db', :aai], ani: ['.ani.db', :ani]
|
19
|
+
}
|
20
|
+
base = File.join(dataset.project.path, 'data', '09.distances')
|
21
|
+
result = :distances
|
22
|
+
if dataset.ref?
|
23
|
+
file_db = "#{dataset.name}.db"
|
24
|
+
ref_db.each do |rank, v|
|
25
|
+
dir, metric = *v
|
26
|
+
file = File.join(base, dir, file_db)
|
27
|
+
blk[file, metric, result, rank] if File.exist? file
|
28
|
+
end
|
29
|
+
# Query databases for reference databases refer to taxonomy runs
|
30
|
+
base = File.join(base, '05.taxonomy')
|
31
|
+
result = :taxonomy
|
32
|
+
end
|
33
|
+
qry_db.each do |rank, v|
|
34
|
+
ext, metric = *v
|
35
|
+
file = File.join(base, "#{dataset.name}#{ext}")
|
36
|
+
blk[file, metric, result, rank] if File.exist? file
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
@@ -0,0 +1,144 @@
|
|
1
|
+
|
2
|
+
module MiGA::Cli::Action::Doctor::Distances
|
3
|
+
##
|
4
|
+
# Perform databases operation with MiGA::Cli +cli+
|
5
|
+
def check_db(cli)
|
6
|
+
cli.say 'Checking integrity of databases'
|
7
|
+
p = cli.load_project
|
8
|
+
n = p.dataset_names.size
|
9
|
+
(0 .. cli[:threads] - 1).map do |i|
|
10
|
+
Process.fork do
|
11
|
+
k = 0
|
12
|
+
p.each_dataset do |d|
|
13
|
+
k += 1
|
14
|
+
cli.advance('Datasets:', k, n, false) if i == 0
|
15
|
+
next unless k % cli[:threads] == i
|
16
|
+
each_database_file(d) do |db_file, metric, result, _rank|
|
17
|
+
check_sqlite3_database(db_file, metric) do
|
18
|
+
cli.say(
|
19
|
+
" > Removing malformed database from #{d.name}:#{result} "
|
20
|
+
)
|
21
|
+
File.unlink(db_file)
|
22
|
+
r = d.result(result) or next
|
23
|
+
[r.path(:done), r.path].each do |f|
|
24
|
+
File.unlink(f) if File.exist?(f)
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
Process.waitall
|
32
|
+
cli.say
|
33
|
+
end
|
34
|
+
|
35
|
+
##
|
36
|
+
# Perform bidirectional operation with MiGA::Cli +cli+
|
37
|
+
def check_bidir(cli)
|
38
|
+
cli.say 'Checking if reference distances are bidirectional'
|
39
|
+
project = cli.load_project
|
40
|
+
ref_ds = project.each_dataset.select(&:ref?)
|
41
|
+
|
42
|
+
# Read and merge data
|
43
|
+
tmp = partial_bidir_tmp(project, ref_ds)
|
44
|
+
dist = merge_bidir_tmp(tmp)
|
45
|
+
FileUtils.rm_rf(tmp)
|
46
|
+
|
47
|
+
# Write missing values (threaded)
|
48
|
+
MiGA::Parallel.distribute(ref_ds, cli[:threads]) do |ds, idx, thr|
|
49
|
+
cli.advance('Datasets:', idx + 1, ref_ds.size, false) if thr == 0
|
50
|
+
save_bidirectional(ds, dist)
|
51
|
+
end
|
52
|
+
cli.say
|
53
|
+
end
|
54
|
+
|
55
|
+
##
|
56
|
+
# Perform distances operation with MiGA::Cli +cli+
|
57
|
+
def check_dist(cli)
|
58
|
+
p = cli.load_project
|
59
|
+
%i[ani aai].each do |dist|
|
60
|
+
res = p.result("#{dist}_distances")
|
61
|
+
next if res.nil?
|
62
|
+
|
63
|
+
cli.say "Checking #{dist} table for consistent datasets"
|
64
|
+
notok, fix = check_dist_eval(cli, p, res)
|
65
|
+
check_dist_fix(cli, p, fix)
|
66
|
+
check_dist_recompute(cli, res, notok)
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
#---- Auxuliary functions -----
|
71
|
+
|
72
|
+
##
|
73
|
+
# Make a temporal directory holding partial bidirectionality reports (one per thread)
|
74
|
+
# in a custom multi-JSON format. Requires a MiGA::Project +project+ and the iterator of
|
75
|
+
# the reference datasets +ref_ds+. Returns the path to the temporal directory created.
|
76
|
+
# Used by +check_bidir+
|
77
|
+
def partial_bidir_tmp(project, ref_ds)
|
78
|
+
n = ref_ds.size
|
79
|
+
|
80
|
+
# Read data first (threaded)
|
81
|
+
tmp = File.join(project.path, 'doctor-bidirectional.tmp')
|
82
|
+
FileUtils.mkdir_p(tmp)
|
83
|
+
MiGA::Parallel.process(cli[:threads]) do |thr|
|
84
|
+
file = File.join(tmp, "#{thr}.json")
|
85
|
+
fh = File.open(file, 'w')
|
86
|
+
[:aai, :ani].each do |metric|
|
87
|
+
fh.puts "# #{metric}"
|
88
|
+
ref_ds.each_with_index do |ds, idx|
|
89
|
+
if idx % cli[:threads] == thr
|
90
|
+
cli.advance('Reading:', idx + 1, n, false) if thr == 0
|
91
|
+
row = read_bidirectional(ds, metric)
|
92
|
+
fh.puts "#{ds.name} #{JSON.fast_generate(row)}" unless row.empty?
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
96
|
+
fh.puts '# end'
|
97
|
+
fh.flush # necessary for large threaded runs
|
98
|
+
fh.close
|
99
|
+
if thr == 0
|
100
|
+
cli.advance('Reading:', n, n, false)
|
101
|
+
cli.say
|
102
|
+
end
|
103
|
+
end
|
104
|
+
|
105
|
+
return tmp
|
106
|
+
end
|
107
|
+
|
108
|
+
##
|
109
|
+
# Read partial temporal reports of bidirectionality (located in +tmp+), and return
|
110
|
+
# a two-deep hash with the final missingness report by metric (first key) and
|
111
|
+
# dataset name (second key). Used by +check_bidir+
|
112
|
+
def merge_bidir_tmp(tmp)
|
113
|
+
dist = { aai: {}, ani: {} }
|
114
|
+
cli[:threads].times do |i|
|
115
|
+
cli.advance('Merging:', i + 1, cli[:threads], false)
|
116
|
+
file = File.join(tmp, "#{i}.json")
|
117
|
+
File.open(file, 'r') do |fh|
|
118
|
+
metric = nil
|
119
|
+
fh.each do |ln|
|
120
|
+
qry, row = ln.chomp.split(' ', 2)
|
121
|
+
row or raise "Unexpected format in #{file}:#{$.}"
|
122
|
+
if qry == '#'
|
123
|
+
metric = row.to_sym
|
124
|
+
else
|
125
|
+
raise "Unrecognized metric: #{metric}" unless dist[metric]
|
126
|
+
JSON.parse(row).each do |sbj, val|
|
127
|
+
dist[metric][qry] ||= {}
|
128
|
+
if dist[metric][sbj]&.include?(qry)
|
129
|
+
dist[metric][sbj].delete(qry) # Already bidirectional
|
130
|
+
else
|
131
|
+
dist[metric][qry][sbj] = val
|
132
|
+
end
|
133
|
+
end
|
134
|
+
end
|
135
|
+
end
|
136
|
+
raise "Incomplete thread dump: #{file}" unless metric == :end
|
137
|
+
end
|
138
|
+
end
|
139
|
+
cli.say
|
140
|
+
|
141
|
+
return dist
|
142
|
+
end
|
143
|
+
end
|
144
|
+
|
@@ -0,0 +1,159 @@
|
|
1
|
+
|
2
|
+
module MiGA::Cli::Action::Doctor::Operations
|
3
|
+
##
|
4
|
+
# Perform status operation with MiGA::Cli +cli+
|
5
|
+
def check_status(cli)
|
6
|
+
cli.say 'Updating metadata status'
|
7
|
+
p = cli.load_project
|
8
|
+
n = p.dataset_names.size
|
9
|
+
(0 .. cli[:threads] - 1).map do |i|
|
10
|
+
Process.fork do
|
11
|
+
k = 0
|
12
|
+
cli.load_project.each_dataset do |d|
|
13
|
+
k += 1
|
14
|
+
cli.advance('Datasets:', k, n, false) if i == 0
|
15
|
+
d.recalculate_status if k % cli[:threads] == i
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
Process.waitall
|
20
|
+
cli.say
|
21
|
+
end
|
22
|
+
|
23
|
+
# check_db in Distances
|
24
|
+
|
25
|
+
# check_bidir in Distances
|
26
|
+
|
27
|
+
# check_dist in Distances
|
28
|
+
|
29
|
+
##
|
30
|
+
# Perform files operation with MiGA::Cli +cli+
|
31
|
+
def check_files(cli)
|
32
|
+
cli.say 'Looking for outdated files in results'
|
33
|
+
n, k = cli.load_project.dataset_names.size, 0
|
34
|
+
cli.load_project.each_dataset do |d|
|
35
|
+
cli.advance('Datasets:', k += 1, n, false)
|
36
|
+
d.each_result do |r_k, r|
|
37
|
+
ok = true
|
38
|
+
r.each_file do |_f_sym, _f_rel, f_abs|
|
39
|
+
unless File.exist? f_abs
|
40
|
+
ok = false
|
41
|
+
break
|
42
|
+
end
|
43
|
+
end
|
44
|
+
unless ok
|
45
|
+
cli.say " > Registering again #{d.name}:#{r_k} "
|
46
|
+
d.add_result(r_k, true, force: true)
|
47
|
+
sr = d.result(:stats) and sr.remove!
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
51
|
+
cli.say
|
52
|
+
end
|
53
|
+
|
54
|
+
##
|
55
|
+
# Perform cds operation with MiGA::Cli +cli+
|
56
|
+
def check_cds(cli)
|
57
|
+
cli.say 'Looking for unzipped genes or proteins'
|
58
|
+
n, k = cli.load_project.dataset_names.size, 0
|
59
|
+
cli.load_project.each_dataset do |d|
|
60
|
+
cli.advance('Datasets:', k += 1, n, false)
|
61
|
+
res = d.result(:cds) or next
|
62
|
+
changed = false
|
63
|
+
%i[genes proteins gff3 gff2 tab].each do |f|
|
64
|
+
file = res.file_path(f) or next
|
65
|
+
if file !~ /\.gz/
|
66
|
+
cli.say " > Gzipping #{d.name} #{f} "
|
67
|
+
run_cmd(['gzip', '-9', file])
|
68
|
+
changed = true
|
69
|
+
end
|
70
|
+
end
|
71
|
+
if changed
|
72
|
+
d.add_result(:cds, true, force: true)
|
73
|
+
sr = d.result(:stats) and sr.remove!
|
74
|
+
end
|
75
|
+
end
|
76
|
+
cli.say
|
77
|
+
end
|
78
|
+
|
79
|
+
##
|
80
|
+
# Perform essential-genes operation with MiGA::Cli +cli+
|
81
|
+
def check_ess(cli)
|
82
|
+
cli.say 'Looking for outdated essential genes'
|
83
|
+
cli.load_project.each_dataset do |d|
|
84
|
+
res = d.result(:essential_genes)
|
85
|
+
next if res.nil?
|
86
|
+
|
87
|
+
dir = res.file_path(:collection)
|
88
|
+
if dir.nil? || outdated_fastaai_ess(res)
|
89
|
+
cli.say " > Removing #{d.name}:essential_genes"
|
90
|
+
res.remove!
|
91
|
+
d.result(:stats)&.remove!
|
92
|
+
next
|
93
|
+
end
|
94
|
+
next if Dir["#{dir}/*.faa"].empty?
|
95
|
+
|
96
|
+
cli.say " > Fixing #{d.name}"
|
97
|
+
run_cmd <<~CMD
|
98
|
+
cd #{dir.shellescape} && tar -zcf proteins.tar.gz *.faa && rm *.faa
|
99
|
+
CMD
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
##
|
104
|
+
# Perform mytaxa-scan operation with MiGA::Cli +cli+
|
105
|
+
def check_mts(cli)
|
106
|
+
cli.say 'Looking for unarchived MyTaxa Scan runs'
|
107
|
+
cli.load_project.each_dataset do |d|
|
108
|
+
res = d.result(:mytaxa_scan)
|
109
|
+
next if res.nil?
|
110
|
+
|
111
|
+
dir = res.file_path(:regions)
|
112
|
+
fix = false
|
113
|
+
unless dir.nil?
|
114
|
+
if Dir.exist? dir
|
115
|
+
run_cmd <<~CMD
|
116
|
+
cd #{dir.shellescape}/.. \
|
117
|
+
&& tar -zcf '#{d.name}.reg.tar.gz' '#{d.name}.reg' \
|
118
|
+
&& rm -r '#{d.name}.reg'
|
119
|
+
CMD
|
120
|
+
end
|
121
|
+
fix = true
|
122
|
+
end
|
123
|
+
%i[blast mytaxain wintax gene_ids region_ids].each do |ext|
|
124
|
+
file = res.file_path(ext)
|
125
|
+
unless file.nil?
|
126
|
+
FileUtils.rm(file) if File.exist? file
|
127
|
+
fix = true
|
128
|
+
end
|
129
|
+
end
|
130
|
+
if fix
|
131
|
+
cli.say " > Fixing #{d.name}"
|
132
|
+
d.add_result(:mytaxa_scan, true, force: true)
|
133
|
+
end
|
134
|
+
end
|
135
|
+
end
|
136
|
+
|
137
|
+
##
|
138
|
+
# Perform start operation with MiGA::Cli +cli+
|
139
|
+
def check_start(cli)
|
140
|
+
cli.say 'Looking for legacy .start files lingering'
|
141
|
+
cli.load_project.each_dataset do |d|
|
142
|
+
d.each_result do |r_k, r|
|
143
|
+
if File.exist? r.path(:start)
|
144
|
+
cli.say " > Registering again #{d.name}:#{r_k}"
|
145
|
+
r.save
|
146
|
+
end
|
147
|
+
end
|
148
|
+
end
|
149
|
+
end
|
150
|
+
|
151
|
+
##
|
152
|
+
# Perform taxonomy operation with MiGA::Cli +cli+
|
153
|
+
def check_tax(cli)
|
154
|
+
# cli.say 'o Checking for taxonomy/distances consistency'
|
155
|
+
# TODO: Find 95%ANI clusters with entries from different species
|
156
|
+
# TODO: Find different 95%ANI clusters with genomes from the same species
|
157
|
+
# TODO: Find AAI values too high or too low for each LCA rank
|
158
|
+
end
|
159
|
+
end
|