miga-base 1.2.14.2 → 1.2.15.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b8a1b62d1125f54550d8cf801c46924abb79085005b8fa4abe732e9347b1fa82
4
- data.tar.gz: baa4a96d604fd649147f47aafeaea038310b8ec588edb8055f7d511842ba3f6b
3
+ metadata.gz: 0ad52d75f0a01d681043530335e20e89ecfb46822a8166dd7c0993b9822882f9
4
+ data.tar.gz: 28bf47b3e257718cb3071ebf48d2f0fd63a8797116f8108faa789643940aac3f
5
5
  SHA512:
6
- metadata.gz: 1ab3bed5edbd0e9ef513c731c9d28999bdb483547376446746157b58bddc92a9ddde7e0cdfd7766351f11e69f1e11df8e2ccf94815ab95108842caa43f6fdd29
7
- data.tar.gz: b3401886ca9139d3ed3a9c19188a1b171254c0d0c1c21d63beffb71b497713dac59ba5fb0176bf2340bc48595640dc6dadfba7881aa7c993cd82894b0679c8a6
6
+ metadata.gz: b498f1617e597feb77f6adc655f7fdd0ee941b670867f5157f12f5af49a462c712a72ea60a7b6ef28ad702a4afdcca0a33bac00216d306ef4bff2ddcee4f844b
7
+ data.tar.gz: 41755ac99346b7dbbdf1a971158f98c99ca970d4134b912b17989d4776ae528902fed6df68d7f477fecfcc02657fae0350f311e3195c2211bc4e0945744366c8
@@ -1,47 +1,7 @@
1
1
  require 'miga/cli/action'
2
2
  require 'miga/sqlite'
3
3
 
4
- class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
5
- end
6
-
7
4
  module MiGA::Cli::Action::Doctor::Base
8
- ##
9
- # Check the database in +db_file+ maintains integrity for the
10
- # tables saving +metric+ (:ani or :aai) and call +blk+ if the
11
- # file is corrupt or doesn't contain the expected structure
12
- def check_sqlite3_database(db_file, metric, &blk)
13
- MiGA::SQLite.new(db_file).run("select count(*) from #{metric}")
14
- rescue SQLite3::SQLException, SQLite3::CorruptException
15
- blk.call
16
- end
17
-
18
- def each_database_file(dataset, &blk)
19
- ref_db = {
20
- haai: ['01.haai', :aai], aai: ['02.aai', :aai], ani: ['03.ani', :ani]
21
- }
22
- qry_db = {
23
- haai: ['.haai.db', :aai], aai: ['.aai.db', :aai], ani: ['.ani.db', :ani]
24
- }
25
- base = File.join(dataset.project.path, 'data', '09.distances')
26
- result = :distances
27
- if dataset.ref?
28
- file_db = "#{dataset.name}.db"
29
- ref_db.each do |rank, v|
30
- dir, metric = *v
31
- file = File.join(base, dir, file_db)
32
- blk[file, metric, result, rank] if File.exist? file
33
- end
34
- # Query databases for reference databases refer to taxonomy runs
35
- base = File.join(base, '05.taxonomy')
36
- result = :taxonomy
37
- end
38
- qry_db.each do |rank, v|
39
- ext, metric = *v
40
- file = File.join(base, "#{dataset.name}#{ext}")
41
- blk[file, metric, result, rank] if File.exist? file
42
- end
43
- end
44
-
45
5
  ##
46
6
  # Scans the all-vs-all matrix registered in +res+ (MiGA::Result) in search of
47
7
  # pairs where one or both datasets are missing or inactive in the project +p+
@@ -157,4 +117,20 @@ module MiGA::Cli::Action::Doctor::Base
157
117
  end
158
118
  end
159
119
  end
120
+
121
##
# Executes +cmd+ through MiGA::MiGA.run_cmd and echoes whatever it printed.
#
# +opts+ is merged on top of the defaults <tt>return: :output</tt>,
# <tt>err2out: true</tt> and <tt>raise: false</tt>, so any key given by the
# caller takes precedence. The captured text is stripped of its trailing
# newline and passed to +warn+ only when it is not empty
def run_cmd(cmd, opts = {})
  defaults = { return: :output, err2out: true, raise: false }
  output = MiGA::MiGA.run_cmd(cmd, defaults.merge(opts)).chomp
  warn(output) unless output.empty?
end
128
+
129
##
# Tells whether the essential genes result +res+ carries an outdated FastAAI
# index: true when +res+ has a path registered for +:fastaai_index+ but none
# for +:fastaai_index_2+, false in every other case
def outdated_fastaai_ess(res)
  old_index, new_index =
    %i[fastaai_index fastaai_index_2].map { |key| res.file_path(key) }
  !old_index.nil? && new_index.nil?
end
160
136
  end
@@ -0,0 +1,39 @@
1
+
2
module MiGA::Cli::Action::Doctor::Databases
  ##
  # Probes the SQLite3 database stored in +db_file+ by counting the rows of
  # the table named after +metric+ (:ani or :aai). When the query raises
  # SQLite3::SQLException or SQLite3::CorruptException, the block +blk+ is
  # called (without arguments) instead of propagating the error
  def check_sqlite3_database(db_file, metric, &blk)
    query = "select count(*) from #{metric}"
    MiGA::SQLite.new(db_file).run(query)
  rescue SQLite3::SQLException, SQLite3::CorruptException
    blk.call
  end

  ##
  # Visits every distance database file of +dataset+ that exists on disk and
  # passes to +blk+ four values: the file path, the metric table it holds
  # (:aai or :ani), the result it belongs to (:distances or :taxonomy), and
  # the rank key (:haai, :aai or :ani).
  #
  # For reference datasets, the per-metric folders under +data/09.distances+
  # are visited first (as :distances), and the per-extension files are then
  # looked up inside +05.taxonomy+ (as :taxonomy). For any other dataset only
  # the per-extension files directly under +data/09.distances+ are visited
  def each_database_file(dataset, &blk)
    by_folder = {
      haai: ['01.haai', :aai], aai: ['02.aai', :aai], ani: ['03.ani', :ani]
    }
    by_suffix = {
      haai: ['.haai.db', :aai], aai: ['.aai.db', :aai], ani: ['.ani.db', :ani]
    }
    root = File.join(dataset.project.path, 'data', '09.distances')
    result_key = :distances

    if dataset.ref?
      db_name = "#{dataset.name}.db"
      by_folder.each do |rank, (folder, metric)|
        path = File.join(root, folder, db_name)
        blk.call(path, metric, result_key, rank) if File.exist?(path)
      end

      # Query databases for reference databases refer to taxonomy runs
      root = File.join(root, '05.taxonomy')
      result_key = :taxonomy
    end

    by_suffix.each do |rank, (suffix, metric)|
      path = File.join(root, "#{dataset.name}#{suffix}")
      blk.call(path, metric, result_key, rank) if File.exist?(path)
    end
  end
end
@@ -0,0 +1,144 @@
1
+
2
module MiGA::Cli::Action::Doctor::Distances
  ##
  # Perform databases operation with MiGA::Cli +cli+.
  #
  # Forks +cli[:threads]+ child processes. Every child walks all the datasets
  # of the project and only processes those whose 1-based position modulo
  # +cli[:threads]+ equals the child's own index. Each database file for
  # which +check_sqlite3_database+ calls its block is deleted, and so are the
  # paths given by +path(:done)+ and +path+ of the corresponding result when
  # that result exists. Returns after all children have finished
  def check_db(cli)
    cli.say 'Checking integrity of databases'
    p = cli.load_project
    n = p.dataset_names.size
    cli[:threads].times do |i|
      Process.fork do
        k = 0
        p.each_dataset do |d|
          k += 1
          # Only the first child reports progress
          cli.advance('Datasets:', k, n, false) if i == 0
          next unless k % cli[:threads] == i

          each_database_file(d) do |db_file, metric, result, _rank|
            check_sqlite3_database(db_file, metric) do
              cli.say(
                " > Removing malformed database from #{d.name}:#{result} "
              )
              File.unlink(db_file)
              # Nothing else to clean if the result is not registered
              r = d.result(result) or next
              [r.path(:done), r.path].each do |f|
                File.unlink(f) if File.exist?(f)
              end
            end
          end
        end
      end
    end
    Process.waitall
    cli.say
  end

  ##
  # Perform bidirectional operation with MiGA::Cli +cli+.
  #
  # Collects the per-worker reports with +partial_bidir_tmp+, merges them with
  # +merge_bidir_tmp+, removes the temporal directory, and finally calls
  # +save_bidirectional+ for each reference dataset with the merged report
  def check_bidir(cli)
    cli.say 'Checking if reference distances are bidirectional'
    project = cli.load_project
    ref_ds = project.each_dataset.select(&:ref?)

    # Read and merge data
    tmp = partial_bidir_tmp(project, ref_ds)
    dist = merge_bidir_tmp(tmp)
    FileUtils.rm_rf(tmp)

    # Write missing values (threaded)
    MiGA::Parallel.distribute(ref_ds, cli[:threads]) do |ds, idx, thr|
      cli.advance('Datasets:', idx + 1, ref_ds.size, false) if thr == 0
      save_bidirectional(ds, dist)
    end
    cli.say
  end

  ##
  # Perform distances operation with MiGA::Cli +cli+.
  #
  # For each of the project results +ani_distances+ and +aai_distances+ that
  # exists, runs +check_dist_eval+, +check_dist_fix+ and
  # +check_dist_recompute+ in that order
  def check_dist(cli)
    p = cli.load_project
    %i[ani aai].each do |dist|
      res = p.result("#{dist}_distances")
      next if res.nil?

      cli.say "Checking #{dist} table for consistent datasets"
      notok, fix = check_dist_eval(cli, p, res)
      check_dist_fix(cli, p, fix)
      check_dist_recompute(cli, res, notok)
    end
  end

  #---- Auxiliary functions -----

  ##
  # Make a temporal directory holding partial bidirectionality reports (one
  # per worker of MiGA::Parallel.process) in a custom multi-JSON format.
  # Requires a MiGA::Project +project+ and the collection of reference
  # datasets +ref_ds+. Returns the path to the temporal directory created.
  # Used by +check_bidir+.
  #
  # Each report is named <tt><worker index>.json</tt> and contains a
  # <tt># aai</tt> section followed by a <tt># ani</tt> section, with one
  # <tt><dataset name> <JSON></tt> line per dataset with a non-empty row, and
  # a closing <tt># end</tt> line.
  #
  # NOTE: +cli+ is not a parameter here; it must be provided by the object
  # including this module
  def partial_bidir_tmp(project, ref_ds)
    n = ref_ds.size

    # Read data first (threaded)
    tmp = File.join(project.path, 'doctor-bidirectional.tmp')
    FileUtils.mkdir_p(tmp)
    MiGA::Parallel.process(cli[:threads]) do |thr|
      file = File.join(tmp, "#{thr}.json")
      # Block form guarantees the handle is closed even if reading fails
      File.open(file, 'w') do |fh|
        %i[aai ani].each do |metric|
          fh.puts "# #{metric}"
          ref_ds.each_with_index do |ds, idx|
            next unless idx % cli[:threads] == thr

            cli.advance('Reading:', idx + 1, n, false) if thr == 0
            row = read_bidirectional(ds, metric)
            fh.puts "#{ds.name} #{JSON.fast_generate(row)}" unless row.empty?
          end
        end
        fh.puts '# end'
        fh.flush # necessary for large threaded runs
      end
      if thr == 0
        cli.advance('Reading:', n, n, false)
        cli.say
      end
    end

    tmp
  end

  ##
  # Read partial temporal reports of bidirectionality (located in +tmp+), and
  # return a two-deep hash with the final missingness report by metric (first
  # key) and dataset name (second key). Used by +check_bidir+.
  #
  # Raises a RuntimeError if a line has no second field, if a data line
  # appears under a metric other than +aai+ or +ani+, or if a report does not
  # finish with the <tt># end</tt> marker.
  #
  # NOTE: +cli+ is not a parameter here; it must be provided by the object
  # including this module
  def merge_bidir_tmp(tmp)
    dist = { aai: {}, ani: {} }
    cli[:threads].times do |i|
      cli.advance('Merging:', i + 1, cli[:threads], false)
      file = File.join(tmp, "#{i}.json")
      File.open(file, 'r') do |fh|
        metric = nil
        fh.each do |ln|
          qry, row = ln.chomp.split(' ', 2)
          row or raise "Unexpected format in #{file}:#{fh.lineno}"
          if qry == '#'
            # Section header: switch the metric being filled
            metric = row.to_sym
          else
            raise "Unrecognized metric: #{metric}" unless dist[metric]

            JSON.parse(row).each do |sbj, val|
              dist[metric][qry] ||= {}
              if dist[metric][sbj]&.include?(qry)
                dist[metric][sbj].delete(qry) # Already bidirectional
              else
                dist[metric][qry][sbj] = val
              end
            end
          end
        end
        raise "Incomplete thread dump: #{file}" unless metric == :end
      end
    end
    cli.say

    dist
  end
end
144
+
@@ -0,0 +1,159 @@
1
+
2
module MiGA::Cli::Action::Doctor::Operations
  ##
  # Perform status operation with MiGA::Cli +cli+.
  #
  # Forks +cli[:threads]+ child processes. Every child loads the project,
  # walks all its datasets, and calls +recalculate_status+ on those whose
  # 1-based position modulo +cli[:threads]+ equals the child's own index.
  # Returns after all children have finished
  def check_status(cli)
    cli.say 'Updating metadata status'
    p = cli.load_project
    n = p.dataset_names.size
    cli[:threads].times do |i|
      Process.fork do
        k = 0
        cli.load_project.each_dataset do |d|
          k += 1
          # Only the first child reports progress
          cli.advance('Datasets:', k, n, false) if i == 0
          d.recalculate_status if k % cli[:threads] == i
        end
      end
    end
    Process.waitall
    cli.say
  end

  # check_db in Distances

  # check_bidir in Distances

  # check_dist in Distances

  ##
  # Perform files operation with MiGA::Cli +cli+.
  #
  # For every result of every dataset, checks that all its registered files
  # exist. If at least one is missing, the result is registered again with
  # +add_result(..., force: true)+ and the dataset's +:stats+ result (if any)
  # is removed
  def check_files(cli)
    cli.say 'Looking for outdated files in results'
    n, k = cli.load_project.dataset_names.size, 0
    cli.load_project.each_dataset do |d|
      cli.advance('Datasets:', k += 1, n, false)
      d.each_result do |r_k, r|
        ok = true
        r.each_file do |_f_sym, _f_rel, f_abs|
          next if File.exist?(f_abs)

          # One missing file is enough to flag the result
          ok = false
          break
        end
        next if ok

        cli.say " > Registering again #{d.name}:#{r_k} "
        d.add_result(r_k, true, force: true)
        d.result(:stats)&.remove!
      end
    end
    cli.say
  end

  ##
  # Perform cds operation with MiGA::Cli +cli+.
  #
  # For every dataset with a +:cds+ result, runs <tt>gzip -9</tt> on each of
  # the +genes+, +proteins+, +gff3+, +gff2+ and +tab+ files whose path does not
  # end in <tt>.gz</tt>. If any file was compressed, the +:cds+ result is
  # registered again and the dataset's +:stats+ result (if any) is removed
  def check_cds(cli)
    cli.say 'Looking for unzipped genes or proteins'
    n, k = cli.load_project.dataset_names.size, 0
    cli.load_project.each_dataset do |d|
      cli.advance('Datasets:', k += 1, n, false)
      res = d.result(:cds) or next
      changed = false
      %i[genes proteins gff3 gff2 tab].each do |f|
        file = res.file_path(f) or next
        # Test the extension only: ".gz" elsewhere in the path (e.g., in a
        # parent directory name) does not mean the file is compressed
        next if file.end_with?('.gz')

        cli.say " > Gzipping #{d.name} #{f} "
        run_cmd(['gzip', '-9', file])
        changed = true
      end
      next unless changed

      d.add_result(:cds, true, force: true)
      d.result(:stats)&.remove!
    end
    cli.say
  end

  ##
  # Perform essential-genes operation with MiGA::Cli +cli+.
  #
  # For every dataset with an +:essential_genes+ result: if the result has no
  # +:collection+ path or +outdated_fastaai_ess+ flags it, the result and the
  # dataset's +:stats+ result (if any) are removed. Otherwise, if the
  # collection directory still holds <tt>*.faa</tt> files, they are archived
  # into <tt>proteins.tar.gz</tt> and deleted
  def check_ess(cli)
    cli.say 'Looking for outdated essential genes'
    cli.load_project.each_dataset do |d|
      res = d.result(:essential_genes)
      next if res.nil?

      dir = res.file_path(:collection)
      if dir.nil? || outdated_fastaai_ess(res)
        cli.say " > Removing #{d.name}:essential_genes"
        res.remove!
        d.result(:stats)&.remove!
        next
      end
      next if Dir["#{dir}/*.faa"].empty?

      cli.say " > Fixing #{d.name}"
      run_cmd <<~CMD
        cd #{dir.shellescape} && tar -zcf proteins.tar.gz *.faa && rm *.faa
      CMD
    end
  end

  ##
  # Perform mytaxa-scan operation with MiGA::Cli +cli+.
  #
  # For every dataset with a +:mytaxa_scan+ result: if a +:regions+ path is
  # registered and it is an existing directory, it is archived as
  # <tt><dataset>.reg.tar.gz</tt> and removed. Any registered +blast+,
  # +mytaxain+, +wintax+, +gene_ids+ or +region_ids+ file that exists is
  # deleted. If any of those entries was registered at all, the result is
  # registered again with +add_result(..., force: true)+
  def check_mts(cli)
    cli.say 'Looking for unarchived MyTaxa Scan runs'
    cli.load_project.each_dataset do |d|
      res = d.result(:mytaxa_scan)
      next if res.nil?

      dir = res.file_path(:regions)
      fix = false
      unless dir.nil?
        if Dir.exist? dir
          run_cmd <<~CMD
            cd #{dir.shellescape}/.. \
            && tar -zcf '#{d.name}.reg.tar.gz' '#{d.name}.reg' \
            && rm -r '#{d.name}.reg'
          CMD
        end
        # A registered entry requires re-registering even if already archived
        fix = true
      end
      %i[blast mytaxain wintax gene_ids region_ids].each do |ext|
        file = res.file_path(ext)
        next if file.nil?

        FileUtils.rm(file) if File.exist? file
        fix = true
      end
      next unless fix

      cli.say " > Fixing #{d.name}"
      d.add_result(:mytaxa_scan, true, force: true)
    end
  end

  ##
  # Perform start operation with MiGA::Cli +cli+.
  #
  # Calls +save+ on every result for which the file at +path(:start)+ still
  # exists
  def check_start(cli)
    cli.say 'Looking for legacy .start files lingering'
    cli.load_project.each_dataset do |d|
      d.each_result do |r_k, r|
        next unless File.exist? r.path(:start)

        cli.say " > Registering again #{d.name}:#{r_k}"
        r.save
      end
    end
  end

  ##
  # Perform taxonomy operation with MiGA::Cli +cli+.
  #
  # Not implemented yet: currently does nothing and returns +nil+
  def check_tax(cli)
    # cli.say 'o Checking for taxonomy/distances consistency'
    # TODO: Find 95%ANI clusters with entries from different species
    # TODO: Find different 95%ANI clusters with genomes from the same species
    # TODO: Find AAI values too high or too low for each LCA rank
  end
end