miga-base 1.2.14.2 → 1.2.15.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/miga/cli/action/doctor/base.rb +16 -40
- data/lib/miga/cli/action/doctor/databases.rb +39 -0
- data/lib/miga/cli/action/doctor/distances.rb +144 -0
- data/lib/miga/cli/action/doctor/operations.rb +159 -0
- data/lib/miga/cli/action/doctor.rb +7 -287
- data/lib/miga/cli/action/download/base.rb +64 -2
- data/lib/miga/cli/action/gtdb_get.rb +2 -31
- data/lib/miga/cli/action/ncbi_get.rb +6 -31
- data/lib/miga/cli/opt_helper.rb +1 -1
- data/lib/miga/common/errors.rb +10 -0
- data/lib/miga/dataset/base.rb +34 -5
- data/lib/miga/dataset/result/add.rb +286 -0
- data/lib/miga/dataset/result/ignore.rb +93 -0
- data/lib/miga/dataset/result.rb +31 -342
- data/lib/miga/remote_dataset/download.rb +6 -0
- data/lib/miga/version.rb +2 -2
- data/test/remote_dataset_test.rb +6 -0
- metadata +7 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0ad52d75f0a01d681043530335e20e89ecfb46822a8166dd7c0993b9822882f9
|
4
|
+
data.tar.gz: 28bf47b3e257718cb3071ebf48d2f0fd63a8797116f8108faa789643940aac3f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b498f1617e597feb77f6adc655f7fdd0ee941b670867f5157f12f5af49a462c712a72ea60a7b6ef28ad702a4afdcca0a33bac00216d306ef4bff2ddcee4f844b
|
7
|
+
data.tar.gz: 41755ac99346b7dbbdf1a971158f98c99ca970d4134b912b17989d4776ae528902fed6df68d7f477fecfcc02657fae0350f311e3195c2211bc4e0945744366c8
|
@@ -1,47 +1,7 @@
|
|
1
1
|
require 'miga/cli/action'
|
2
2
|
require 'miga/sqlite'
|
3
3
|
|
4
|
-
class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
|
5
|
-
end
|
6
|
-
|
7
4
|
module MiGA::Cli::Action::Doctor::Base
|
8
|
-
##
|
9
|
-
# Check the database in +db_file+ maintains integrity for the
|
10
|
-
# tables saving +metric+ (:ani or :aai) and call +blk+ if the
|
11
|
-
# file is corrupt or doesn't contain the expected structure
|
12
|
-
def check_sqlite3_database(db_file, metric, &blk)
|
13
|
-
MiGA::SQLite.new(db_file).run("select count(*) from #{metric}")
|
14
|
-
rescue SQLite3::SQLException, SQLite3::CorruptException
|
15
|
-
blk.call
|
16
|
-
end
|
17
|
-
|
18
|
-
def each_database_file(dataset, &blk)
|
19
|
-
ref_db = {
|
20
|
-
haai: ['01.haai', :aai], aai: ['02.aai', :aai], ani: ['03.ani', :ani]
|
21
|
-
}
|
22
|
-
qry_db = {
|
23
|
-
haai: ['.haai.db', :aai], aai: ['.aai.db', :aai], ani: ['.ani.db', :ani]
|
24
|
-
}
|
25
|
-
base = File.join(dataset.project.path, 'data', '09.distances')
|
26
|
-
result = :distances
|
27
|
-
if dataset.ref?
|
28
|
-
file_db = "#{dataset.name}.db"
|
29
|
-
ref_db.each do |rank, v|
|
30
|
-
dir, metric = *v
|
31
|
-
file = File.join(base, dir, file_db)
|
32
|
-
blk[file, metric, result, rank] if File.exist? file
|
33
|
-
end
|
34
|
-
# Query databases for reference databases refer to taxonomy runs
|
35
|
-
base = File.join(base, '05.taxonomy')
|
36
|
-
result = :taxonomy
|
37
|
-
end
|
38
|
-
qry_db.each do |rank, v|
|
39
|
-
ext, metric = *v
|
40
|
-
file = File.join(base, "#{dataset.name}#{ext}")
|
41
|
-
blk[file, metric, result, rank] if File.exist? file
|
42
|
-
end
|
43
|
-
end
|
44
|
-
|
45
5
|
##
|
46
6
|
# Scans the all-vs-all matrix registered in +res+ (MiGA::Result) in search of
|
47
7
|
# pairs where one or both datasets are missing or inactive in the project +p+
|
@@ -157,4 +117,20 @@ module MiGA::Cli::Action::Doctor::Base
|
|
157
117
|
end
|
158
118
|
end
|
159
119
|
end
|
120
|
+
|
121
|
+
##
# Execute +cmd+ through MiGA::MiGA.run_cmd and echo whatever it printed to
# the warning stream. Entries in +opts+ override the defaults applied here
# (<tt>return: :output</tt>, <tt>err2out: true</tt>, <tt>raise: false</tt>).
# Nothing is printed when the chomped output is empty. Always returns +nil+.
def run_cmd(cmd, opts = {})
  defaults = { return: :output, err2out: true, raise: false }
  output = MiGA::MiGA.run_cmd(cmd, defaults.merge(opts)).chomp
  return if output.empty?

  warn(output)
end
|
128
|
+
|
129
|
+
##
# Tell whether the essential genes result +res+ carries an outdated FastAAI
# index: true only when +res+ reports a path for +:fastaai_index+ but none
# for +:fastaai_index_2+.
def outdated_fastaai_ess(res)
  legacy, current = %i[fastaai_index fastaai_index_2].map do |key|
    res.file_path(key)
  end
  !legacy.nil? && current.nil?
end
|
160
136
|
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
|
2
|
+
module MiGA::Cli::Action::Doctor::Databases
|
3
|
+
##
# Probe the SQLite3 file +db_file+ by counting the rows of the table named
# after +metric+ (:ani or :aai). If the query raises SQLite3::SQLException
# or SQLite3::CorruptException, +blk+ is called and its value is returned;
# otherwise the query result is returned. Any other error propagates.
def check_sqlite3_database(db_file, metric, &blk)
  begin
    database = MiGA::SQLite.new(db_file)
    database.run("select count(*) from #{metric}")
  rescue SQLite3::SQLException, SQLite3::CorruptException
    blk.call
  end
end
|
12
|
+
|
13
|
+
##
# Call +blk+ once per existing distance database file of +dataset+, passing
# the file path, the metric table (:aai or :ani), the result the file
# belongs to (:distances or :taxonomy) and the rank (:haai, :aai or :ani).
#
# For reference datasets, the per-rank folders under data/09.distances are
# visited first (result :distances) and the query-style files are then looked
# up under 05.taxonomy (result :taxonomy). For any other dataset the
# query-style files are looked up directly in data/09.distances.
def each_database_file(dataset, &blk)
  distances_dir = File.join(dataset.project.path, 'data', '09.distances')
  query_dir = distances_dir
  result = :distances

  if dataset.ref?
    db_name = "#{dataset.name}.db"
    by_folder = {
      haai: ['01.haai', :aai], aai: ['02.aai', :aai], ani: ['03.ani', :ani]
    }
    by_folder.each do |rank, (folder, metric)|
      path = File.join(distances_dir, folder, db_name)
      blk.call(path, metric, result, rank) if File.exist?(path)
    end
    # Query databases for reference datasets refer to taxonomy runs
    query_dir = File.join(distances_dir, '05.taxonomy')
    result = :taxonomy
  end

  by_suffix = {
    haai: ['.haai.db', :aai], aai: ['.aai.db', :aai], ani: ['.ani.db', :ani]
  }
  by_suffix.each do |rank, (suffix, metric)|
    path = File.join(query_dir, "#{dataset.name}#{suffix}")
    blk.call(path, metric, result, rank) if File.exist?(path)
  end
end
|
39
|
+
end
|
@@ -0,0 +1,144 @@
|
|
1
|
+
|
2
|
+
module MiGA::Cli::Action::Doctor::Distances
|
3
|
+
##
# Perform databases operation with MiGA::Cli +cli+.
#
# Forks one child process per configured thread (+cli[:threads]+). Every
# child walks all datasets of the project and handles those whose running
# count modulo the thread count equals its own index; only child 0 reports
# progress. For each database file found by +each_database_file+ that
# +check_sqlite3_database+ flags, the file is deleted together with the done
# marker and the result file of the owning result, when that result exists.
def check_db(cli)
  cli.say 'Checking integrity of databases'
  project = cli.load_project
  total = project.dataset_names.size
  cli[:threads].times do |worker|
    Process.fork do
      seen = 0
      project.each_dataset do |dataset|
        seen += 1
        cli.advance('Datasets:', seen, total, false) if worker.zero?
        next if seen % cli[:threads] != worker

        each_database_file(dataset) do |db_file, metric, result, _rank|
          check_sqlite3_database(db_file, metric) do
            cli.say(
              " > Removing malformed database from #{dataset.name}:#{result} "
            )
            File.unlink(db_file)
            registered = dataset.result(result)
            next unless registered

            [registered.path(:done), registered.path].each do |stale|
              File.unlink(stale) if File.exist?(stale)
            end
          end
        end
      end
    end
  end
  Process.waitall
  cli.say
end
|
34
|
+
|
35
|
+
##
# Perform bidirectional operation with MiGA::Cli +cli+.
#
# Collects the reference datasets of the project, builds the per-thread
# partial reports with +partial_bidir_tmp+, merges them with
# +merge_bidir_tmp+, deletes the temporary directory, and finally hands
# every reference dataset plus the merged report to +save_bidirectional+
# through MiGA::Parallel.distribute.
def check_bidir(cli)
  cli.say 'Checking if reference distances are bidirectional'
  project = cli.load_project
  references = project.each_dataset.select { |dataset| dataset.ref? }

  # Read and merge data
  tmp_dir = partial_bidir_tmp(project, references)
  missing = merge_bidir_tmp(tmp_dir)
  FileUtils.rm_rf(tmp_dir)

  # Write missing values (threaded)
  total = references.size
  MiGA::Parallel.distribute(references, cli[:threads]) do |dataset, index, thread|
    cli.advance('Datasets:', index + 1, total, false) if thread.zero?
    save_bidirectional(dataset, missing)
  end
  cli.say
end
|
54
|
+
|
55
|
+
##
# Perform distances operation with MiGA::Cli +cli+.
#
# For each of the project results +ani_distances+ and +aai_distances+ that
# exists, runs +check_dist_eval+, then passes its second return value to
# +check_dist_fix+ and its first one to +check_dist_recompute+.
def check_dist(cli)
  project = cli.load_project
  [:ani, :aai].each do |metric|
    table = project.result("#{metric}_distances")
    unless table.nil?
      cli.say "Checking #{metric} table for consistent datasets"
      inconsistent, fixable = check_dist_eval(cli, project, table)
      check_dist_fix(cli, project, fixable)
      check_dist_recompute(cli, table, inconsistent)
    end
  end
end
|
69
|
+
|
70
|
+
#---- Auxiliary functions -----
|
71
|
+
|
72
|
+
##
# Make a temporary directory holding partial bidirectionality reports (one
# file per thread, named +<thread>.json+) in a custom multi-JSON format:
# a <tt># <metric></tt> header per metric, one <tt><dataset> <JSON row></tt>
# line per dataset with a non-empty row, and a closing <tt># end</tt> line.
#
# Requires a MiGA::Project +project+ and the collection of reference datasets
# +ref_ds+ (must respond to +size+ and +each_with_index+). Each thread handles
# the datasets whose index modulo +cli[:threads]+ equals its own number, and
# only thread 0 reports progress. Returns the path to the directory created
# (+doctor-bidirectional.tmp+ inside the project path).
# Used by +check_bidir+
def partial_bidir_tmp(project, ref_ds)
  n = ref_ds.size

  # Read data first (threaded)
  tmp = File.join(project.path, 'doctor-bidirectional.tmp')
  FileUtils.mkdir_p(tmp)
  MiGA::Parallel.process(cli[:threads]) do |thr|
    file = File.join(tmp, "#{thr}.json")
    # Block form closes the handle even when reading a dataset raises
    File.open(file, 'w') do |fh|
      %i[aai ani].each do |metric|
        fh.puts "# #{metric}"
        ref_ds.each_with_index do |ds, idx|
          next unless idx % cli[:threads] == thr

          cli.advance('Reading:', idx + 1, n, false) if thr == 0
          row = read_bidirectional(ds, metric)
          # JSON.generate yields the same text as the deprecated fast_generate
          fh.puts "#{ds.name} #{JSON.generate(row)}" unless row.empty?
        end
      end
      fh.puts '# end'
      fh.flush # necessary for large threaded runs
    end
    if thr == 0
      cli.advance('Reading:', n, n, false)
      cli.say
    end
  end

  tmp
end
|
107
|
+
|
108
|
+
##
# Read the partial reports of bidirectionality written in the directory
# +tmp+ (one +<index>.json+ file per configured thread) and return a
# two-deep hash with the final missingness report, keyed by metric (:aai or
# :ani) and then by dataset name. A pair already recorded in the opposite
# direction is dropped from the report instead of being added.
#
# Raises a RuntimeError if a line has no payload, if a data line appears
# under a header that is neither +aai+ nor +ani+, or if a file does not end
# with the <tt># end</tt> marker. Used by +check_bidir+
def merge_bidir_tmp(tmp)
  merged = { aai: {}, ani: {} }
  cli[:threads].times do |index|
    cli.advance('Merging:', index + 1, cli[:threads], false)
    dump = File.join(tmp, "#{index}.json")
    File.open(dump, 'r') do |io|
      section = nil
      io.each_line do |line|
        name, payload = line.chomp.split(' ', 2)
        raise "Unexpected format in #{dump}:#{io.lineno}" unless payload

        # Header lines switch the metric that the following rows belong to
        if name == '#'
          section = payload.to_sym
          next
        end

        table = merged[section]
        raise "Unrecognized metric: #{section}" unless table

        JSON.parse(payload).each do |subject, value|
          table[name] ||= {}
          reverse = table[subject]
          if reverse&.include?(name)
            reverse.delete(name) # Already bidirectional
          else
            table[name][subject] = value
          end
        end
      end
      raise "Incomplete thread dump: #{dump}" unless section == :end
    end
  end
  cli.say

  merged
end
|
143
|
+
end
|
144
|
+
|
@@ -0,0 +1,159 @@
|
|
1
|
+
|
2
|
+
module MiGA::Cli::Action::Doctor::Operations
|
3
|
+
##
# Perform status operation with MiGA::Cli +cli+.
#
# Forks one child process per configured thread (+cli[:threads]+). Every
# child reloads the project, walks all its datasets, and calls
# +recalculate_status+ on those whose running count modulo the thread count
# equals its own index; only child 0 reports progress.
def check_status(cli)
  cli.say 'Updating metadata status'
  project = cli.load_project
  total = project.dataset_names.size
  cli[:threads].times do |worker|
    Process.fork do
      seen = 0
      cli.load_project.each_dataset do |dataset|
        seen += 1
        cli.advance('Datasets:', seen, total, false) if worker.zero?
        next unless seen % cli[:threads] == worker

        dataset.recalculate_status
      end
    end
  end
  Process.waitall
  cli.say
end
|
22
|
+
|
23
|
+
# check_db in Distances
|
24
|
+
|
25
|
+
# check_bidir in Distances
|
26
|
+
|
27
|
+
# check_dist in Distances
|
28
|
+
|
29
|
+
##
# Perform files operation with MiGA::Cli +cli+.
#
# Walks every result of every dataset and stops at the first registered file
# that does not exist on disk. When one is missing, the result is added
# again to the dataset (+add_result+ with <tt>force: true</tt>) and the
# dataset's +:stats+ result, if any, is removed.
def check_files(cli)
  cli.say 'Looking for outdated files in results'
  total = cli.load_project.dataset_names.size
  done = 0
  cli.load_project.each_dataset do |dataset|
    done += 1
    cli.advance('Datasets:', done, total, false)
    dataset.each_result do |key, result|
      complete = true
      result.each_file do |_sym, _rel, abs_path|
        next if File.exist?(abs_path)

        complete = false
        break
      end
      next if complete

      cli.say " > Registering again #{dataset.name}:#{key} "
      dataset.add_result(key, true, force: true)
      stats = dataset.result(:stats)
      stats.remove! if stats
    end
  end
  cli.say
end
|
53
|
+
|
54
|
+
##
# Perform cds operation with MiGA::Cli +cli+.
#
# For every dataset with a +:cds+ result, looks at the registered +genes+,
# +proteins+, +gff3+, +gff2+ and +tab+ files and runs <tt>gzip -9</tt> (via
# +run_cmd+) on each one whose path does not end in +.gz+. If anything was
# compressed, the +:cds+ result is added again to the dataset (+add_result+
# with <tt>force: true</tt>) and the dataset's +:stats+ result, if any, is
# removed.
def check_cds(cli)
  cli.say 'Looking for unzipped genes or proteins'
  n = cli.load_project.dataset_names.size
  k = 0
  cli.load_project.each_dataset do |d|
    cli.advance('Datasets:', k += 1, n, false)
    res = d.result(:cds)
    next unless res

    changed = false
    %i[genes proteins gff3 gff2 tab].each do |f|
      file = res.file_path(f)
      next unless file
      # Test the suffix only: a bare /\.gz/ match would also skip plain files
      # that merely live under a path containing ".gz"
      next if file.end_with?('.gz')

      cli.say " > Gzipping #{d.name} #{f} "
      run_cmd(['gzip', '-9', file])
      changed = true
    end
    next unless changed

    d.add_result(:cds, true, force: true)
    sr = d.result(:stats)
    sr.remove! if sr
  end
  cli.say
end
|
78
|
+
|
79
|
+
##
# Perform essential-genes operation with MiGA::Cli +cli+.
#
# For every dataset with an +:essential_genes+ result:
# - if the result has no +:collection+ path or +outdated_fastaai_ess+ flags
#   it, the result is removed along with the dataset's +:stats+ result;
# - otherwise, if the collection folder still holds +.faa+ files, they are
#   packed into +proteins.tar.gz+ and deleted through a shell command.
def check_ess(cli)
  cli.say 'Looking for outdated essential genes'
  cli.load_project.each_dataset do |dataset|
    ess = dataset.result(:essential_genes)
    next if ess.nil?

    collection = ess.file_path(:collection)
    if !collection.nil? && !outdated_fastaai_ess(ess)
      unless Dir["#{collection}/*.faa"].empty?
        cli.say " > Fixing #{dataset.name}"
        run_cmd <<~CMD
          cd #{collection.shellescape} && tar -zcf proteins.tar.gz *.faa && rm *.faa
        CMD
      end
    else
      cli.say " > Removing #{dataset.name}:essential_genes"
      ess.remove!
      dataset.result(:stats)&.remove!
    end
  end
end
|
102
|
+
|
103
|
+
##
# Perform mytaxa-scan operation with MiGA::Cli +cli+.
#
# For every dataset with a +:mytaxa_scan+ result:
# - if the result registers a +:regions+ path and that folder exists, the
#   <tt><dataset>.reg</tt> folder is packed into <tt><dataset>.reg.tar.gz</tt>
#   and deleted through a shell command run from the parent folder;
# - the registered +blast+, +mytaxain+, +wintax+, +gene_ids+ and
#   +region_ids+ files are deleted when they exist.
# If any of those entries was registered (even when already gone from disk),
# the result is added again to the dataset (+add_result+ with
# <tt>force: true</tt>).
def check_mts(cli)
  cli.say 'Looking for unarchived MyTaxa Scan runs'
  cli.load_project.each_dataset do |d|
    res = d.result(:mytaxa_scan)
    next if res.nil?

    dir = res.file_path(:regions)
    fix = false
    unless dir.nil?
      if Dir.exist? dir
        # Escape every interpolated piece the same way instead of relying
        # on hand-written single quotes around the dataset-derived names
        folder = "#{d.name}.reg".shellescape
        archive = "#{d.name}.reg.tar.gz".shellescape
        run_cmd(
          "cd #{dir.shellescape}/.. " \
          "&& tar -zcf #{archive} #{folder} " \
          "&& rm -r #{folder}"
        )
      end
      fix = true
    end
    %i[blast mytaxain wintax gene_ids region_ids].each do |ext|
      file = res.file_path(ext)
      next if file.nil?

      FileUtils.rm(file) if File.exist? file
      fix = true
    end
    next unless fix

    cli.say " > Fixing #{d.name}"
    d.add_result(:mytaxa_scan, true, force: true)
  end
end
|
136
|
+
|
137
|
+
##
# Perform start operation with MiGA::Cli +cli+.
#
# Calls +save+ on every dataset result whose +:start+ path still exists on
# disk, announcing each one.
def check_start(cli)
  cli.say 'Looking for legacy .start files lingering'
  cli.load_project.each_dataset do |dataset|
    dataset.each_result do |key, result|
      next unless File.exist?(result.path(:start))

      cli.say " > Registering again #{dataset.name}:#{key}"
      result.save
    end
  end
end
|
150
|
+
|
151
|
+
##
# Perform taxonomy operation with MiGA::Cli +cli+.
#
# Placeholder: the body holds no executable statements, so +cli+ is unused
# and the method returns +nil+. The notes below list the checks still to be
# implemented.
def check_tax(cli)
  # cli.say 'o Checking for taxonomy/distances consistency'
  # TODO: Find 95%ANI clusters with entries from different species
  # TODO: Find different 95%ANI clusters with genomes from the same species
  # TODO: Find AAI values too high or too low for each LCA rank
end
|
159
|
+
end
|