miga-base 1.2.15.0 → 1.2.15.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9de962d747353644dddff05181f74c87a3f401df14e81d1810f125c9d1843351
4
- data.tar.gz: 3dd90251f07976c36adfdabaa603e399bc046f68c14ff83680eb6a6de4219159
3
+ metadata.gz: 0ad52d75f0a01d681043530335e20e89ecfb46822a8166dd7c0993b9822882f9
4
+ data.tar.gz: 28bf47b3e257718cb3071ebf48d2f0fd63a8797116f8108faa789643940aac3f
5
5
  SHA512:
6
- metadata.gz: 68f3c9616c1d2ad0fed1f03fc6b1054d428fa8b61999febcf6394d34c261550e203b108d2399a43077d95980b6c623e40799e0bf233cfcc130ff5ef97fdc4e47
7
- data.tar.gz: e95d549756ad10f86f4abf1761fb763e8ee218ff81d1cf8f6b597c831227482b6e90871800eb38863a8a12ed622b3222385b5f730e3e4103de06aa24753e8a3a
6
+ metadata.gz: b498f1617e597feb77f6adc655f7fdd0ee941b670867f5157f12f5af49a462c712a72ea60a7b6ef28ad702a4afdcca0a33bac00216d306ef4bff2ddcee4f844b
7
+ data.tar.gz: 41755ac99346b7dbbdf1a971158f98c99ca970d4134b912b17989d4776ae528902fed6df68d7f477fecfcc02657fae0350f311e3195c2211bc4e0945744366c8
@@ -1,47 +1,7 @@
1
1
  require 'miga/cli/action'
2
2
  require 'miga/sqlite'
3
3
 
4
- class MiGA::Cli::Action::Doctor < MiGA::Cli::Action
5
- end
6
-
7
4
  module MiGA::Cli::Action::Doctor::Base
8
- ##
9
- # Check the database in +db_file+ maintains integrity for the
10
- # tables saving +metric+ (:ani or :aai) and call +blk+ if the
11
- # file is corrupt or doesn't contain the expected structure
12
- def check_sqlite3_database(db_file, metric, &blk)
13
- MiGA::SQLite.new(db_file).run("select count(*) from #{metric}")
14
- rescue SQLite3::SQLException, SQLite3::CorruptException
15
- blk.call
16
- end
17
-
18
- def each_database_file(dataset, &blk)
19
- ref_db = {
20
- haai: ['01.haai', :aai], aai: ['02.aai', :aai], ani: ['03.ani', :ani]
21
- }
22
- qry_db = {
23
- haai: ['.haai.db', :aai], aai: ['.aai.db', :aai], ani: ['.ani.db', :ani]
24
- }
25
- base = File.join(dataset.project.path, 'data', '09.distances')
26
- result = :distances
27
- if dataset.ref?
28
- file_db = "#{dataset.name}.db"
29
- ref_db.each do |rank, v|
30
- dir, metric = *v
31
- file = File.join(base, dir, file_db)
32
- blk[file, metric, result, rank] if File.exist? file
33
- end
34
- # Query databases for reference databases refer to taxonomy runs
35
- base = File.join(base, '05.taxonomy')
36
- result = :taxonomy
37
- end
38
- qry_db.each do |rank, v|
39
- ext, metric = *v
40
- file = File.join(base, "#{dataset.name}#{ext}")
41
- blk[file, metric, result, rank] if File.exist? file
42
- end
43
- end
44
-
45
5
  ##
46
6
  # Scans the all-vs-all matrix registered in +res+ (MiGA::Result) in search of
47
7
  # pairs where one or both datasets are missing or inactive in the project +p+
@@ -157,4 +117,20 @@ module MiGA::Cli::Action::Doctor::Base
157
117
  end
158
118
  end
159
119
  end
120
+
121
+ ##
122
+ # Run command +cmd+ with options +opts+
123
+ def run_cmd(cmd, opts = {})
124
+ opts = { return: :output, err2out: true, raise: false }.merge(opts)
125
+ cmdo = MiGA::MiGA.run_cmd(cmd, opts).chomp
126
+ warn(cmdo) unless cmdo.empty?
127
+ end
128
+
129
+ ##
130
+ # Check if the essential genes result +res+ has an outdated FastAAI index
131
+ def outdated_fastaai_ess(res)
132
+ idx1 = res.file_path(:fastaai_index)
133
+ idx2 = res.file_path(:fastaai_index_2)
134
+ idx2.nil? && !idx1.nil?
135
+ end
160
136
  end
@@ -0,0 +1,39 @@
1
+
2
+ module MiGA::Cli::Action::Doctor::Databases
3
+ ##
4
+ # Check the database in +db_file+ maintains integrity for the
5
+ # tables saving +metric+ (:ani or :aai) and call +blk+ if the
6
+ # file is corrupt or doesn't contain the expected structure
7
+ def check_sqlite3_database(db_file, metric, &blk)
8
+ MiGA::SQLite.new(db_file).run("select count(*) from #{metric}")
9
+ rescue SQLite3::SQLException, SQLite3::CorruptException
10
+ blk.call
11
+ end
12
+
13
+ def each_database_file(dataset, &blk)
14
+ ref_db = {
15
+ haai: ['01.haai', :aai], aai: ['02.aai', :aai], ani: ['03.ani', :ani]
16
+ }
17
+ qry_db = {
18
+ haai: ['.haai.db', :aai], aai: ['.aai.db', :aai], ani: ['.ani.db', :ani]
19
+ }
20
+ base = File.join(dataset.project.path, 'data', '09.distances')
21
+ result = :distances
22
+ if dataset.ref?
23
+ file_db = "#{dataset.name}.db"
24
+ ref_db.each do |rank, v|
25
+ dir, metric = *v
26
+ file = File.join(base, dir, file_db)
27
+ blk[file, metric, result, rank] if File.exist? file
28
+ end
29
+ # Query databases for reference datasets refer to taxonomy runs
30
+ base = File.join(base, '05.taxonomy')
31
+ result = :taxonomy
32
+ end
33
+ qry_db.each do |rank, v|
34
+ ext, metric = *v
35
+ file = File.join(base, "#{dataset.name}#{ext}")
36
+ blk[file, metric, result, rank] if File.exist? file
37
+ end
38
+ end
39
+ end
@@ -0,0 +1,144 @@
1
+
2
+ module MiGA::Cli::Action::Doctor::Distances
3
+ ##
4
+ # Perform databases operation with MiGA::Cli +cli+
5
+ def check_db(cli)
6
+ cli.say 'Checking integrity of databases'
7
+ p = cli.load_project
8
+ n = p.dataset_names.size
9
+ (0 .. cli[:threads] - 1).map do |i|
10
+ Process.fork do
11
+ k = 0
12
+ p.each_dataset do |d|
13
+ k += 1
14
+ cli.advance('Datasets:', k, n, false) if i == 0
15
+ next unless k % cli[:threads] == i
16
+ each_database_file(d) do |db_file, metric, result, _rank|
17
+ check_sqlite3_database(db_file, metric) do
18
+ cli.say(
19
+ " > Removing malformed database from #{d.name}:#{result} "
20
+ )
21
+ File.unlink(db_file)
22
+ r = d.result(result) or next
23
+ [r.path(:done), r.path].each do |f|
24
+ File.unlink(f) if File.exist?(f)
25
+ end
26
+ end
27
+ end
28
+ end
29
+ end
30
+ end
31
+ Process.waitall
32
+ cli.say
33
+ end
34
+
35
+ ##
36
+ # Perform bidirectional operation with MiGA::Cli +cli+
37
+ def check_bidir(cli)
38
+ cli.say 'Checking if reference distances are bidirectional'
39
+ project = cli.load_project
40
+ ref_ds = project.each_dataset.select(&:ref?)
41
+
42
+ # Read and merge data
43
+ tmp = partial_bidir_tmp(project, ref_ds)
44
+ dist = merge_bidir_tmp(tmp)
45
+ FileUtils.rm_rf(tmp)
46
+
47
+ # Write missing values (threaded)
48
+ MiGA::Parallel.distribute(ref_ds, cli[:threads]) do |ds, idx, thr|
49
+ cli.advance('Datasets:', idx + 1, ref_ds.size, false) if thr == 0
50
+ save_bidirectional(ds, dist)
51
+ end
52
+ cli.say
53
+ end
54
+
55
+ ##
56
+ # Perform distances operation with MiGA::Cli +cli+
57
+ def check_dist(cli)
58
+ p = cli.load_project
59
+ %i[ani aai].each do |dist|
60
+ res = p.result("#{dist}_distances")
61
+ next if res.nil?
62
+
63
+ cli.say "Checking #{dist} table for consistent datasets"
64
+ notok, fix = check_dist_eval(cli, p, res)
65
+ check_dist_fix(cli, p, fix)
66
+ check_dist_recompute(cli, res, notok)
67
+ end
68
+ end
69
+
70
+ #---- Auxiliary functions -----
71
+
72
+ ##
73
+ # Make a temporary directory holding partial bidirectionality reports (one per thread)
74
+ # in a custom multi-JSON format. Requires a MiGA::Project +project+ and the iterator of
75
+ # the reference datasets +ref_ds+. Returns the path to the temporary directory created.
76
+ # Used by +check_bidir+
77
+ def partial_bidir_tmp(project, ref_ds)
78
+ n = ref_ds.size
79
+
80
+ # Read data first (threaded)
81
+ tmp = File.join(project.path, 'doctor-bidirectional.tmp')
82
+ FileUtils.mkdir_p(tmp)
83
+ MiGA::Parallel.process(cli[:threads]) do |thr|
84
+ file = File.join(tmp, "#{thr}.json")
85
+ fh = File.open(file, 'w')
86
+ [:aai, :ani].each do |metric|
87
+ fh.puts "# #{metric}"
88
+ ref_ds.each_with_index do |ds, idx|
89
+ if idx % cli[:threads] == thr
90
+ cli.advance('Reading:', idx + 1, n, false) if thr == 0
91
+ row = read_bidirectional(ds, metric)
92
+ fh.puts "#{ds.name} #{JSON.fast_generate(row)}" unless row.empty?
93
+ end
94
+ end
95
+ end
96
+ fh.puts '# end'
97
+ fh.flush # necessary for large threaded runs
98
+ fh.close
99
+ if thr == 0
100
+ cli.advance('Reading:', n, n, false)
101
+ cli.say
102
+ end
103
+ end
104
+
105
+ return tmp
106
+ end
107
+
108
+ ##
109
+ # Read partial temporary reports of bidirectionality (located in +tmp+), and return
110
+ # a two-deep hash with the final missingness report by metric (first key) and
111
+ # dataset name (second key). Used by +check_bidir+
112
+ def merge_bidir_tmp(tmp)
113
+ dist = { aai: {}, ani: {} }
114
+ cli[:threads].times do |i|
115
+ cli.advance('Merging:', i + 1, cli[:threads], false)
116
+ file = File.join(tmp, "#{i}.json")
117
+ File.open(file, 'r') do |fh|
118
+ metric = nil
119
+ fh.each do |ln|
120
+ qry, row = ln.chomp.split(' ', 2)
121
+ row or raise "Unexpected format in #{file}:#{$.}"
122
+ if qry == '#'
123
+ metric = row.to_sym
124
+ else
125
+ raise "Unrecognized metric: #{metric}" unless dist[metric]
126
+ JSON.parse(row).each do |sbj, val|
127
+ dist[metric][qry] ||= {}
128
+ if dist[metric][sbj]&.include?(qry)
129
+ dist[metric][sbj].delete(qry) # Already bidirectional
130
+ else
131
+ dist[metric][qry][sbj] = val
132
+ end
133
+ end
134
+ end
135
+ end
136
+ raise "Incomplete thread dump: #{file}" unless metric == :end
137
+ end
138
+ end
139
+ cli.say
140
+
141
+ return dist
142
+ end
143
+ end
144
+
@@ -0,0 +1,159 @@
1
+
2
+ module MiGA::Cli::Action::Doctor::Operations
3
+ ##
4
+ # Perform status operation with MiGA::Cli +cli+
5
+ def check_status(cli)
6
+ cli.say 'Updating metadata status'
7
+ p = cli.load_project
8
+ n = p.dataset_names.size
9
+ (0 .. cli[:threads] - 1).map do |i|
10
+ Process.fork do
11
+ k = 0
12
+ cli.load_project.each_dataset do |d|
13
+ k += 1
14
+ cli.advance('Datasets:', k, n, false) if i == 0
15
+ d.recalculate_status if k % cli[:threads] == i
16
+ end
17
+ end
18
+ end
19
+ Process.waitall
20
+ cli.say
21
+ end
22
+
23
+ # check_db in Distances
24
+
25
+ # check_bidir in Distances
26
+
27
+ # check_dist in Distances
28
+
29
+ ##
30
+ # Perform files operation with MiGA::Cli +cli+
31
+ def check_files(cli)
32
+ cli.say 'Looking for outdated files in results'
33
+ n, k = cli.load_project.dataset_names.size, 0
34
+ cli.load_project.each_dataset do |d|
35
+ cli.advance('Datasets:', k += 1, n, false)
36
+ d.each_result do |r_k, r|
37
+ ok = true
38
+ r.each_file do |_f_sym, _f_rel, f_abs|
39
+ unless File.exist? f_abs
40
+ ok = false
41
+ break
42
+ end
43
+ end
44
+ unless ok
45
+ cli.say " > Registering again #{d.name}:#{r_k} "
46
+ d.add_result(r_k, true, force: true)
47
+ sr = d.result(:stats) and sr.remove!
48
+ end
49
+ end
50
+ end
51
+ cli.say
52
+ end
53
+
54
+ ##
55
+ # Perform cds operation with MiGA::Cli +cli+
56
+ def check_cds(cli)
57
+ cli.say 'Looking for unzipped genes or proteins'
58
+ n, k = cli.load_project.dataset_names.size, 0
59
+ cli.load_project.each_dataset do |d|
60
+ cli.advance('Datasets:', k += 1, n, false)
61
+ res = d.result(:cds) or next
62
+ changed = false
63
+ %i[genes proteins gff3 gff2 tab].each do |f|
64
+ file = res.file_path(f) or next
65
+ if file !~ /\.gz/
66
+ cli.say " > Gzipping #{d.name} #{f} "
67
+ run_cmd(['gzip', '-9', file])
68
+ changed = true
69
+ end
70
+ end
71
+ if changed
72
+ d.add_result(:cds, true, force: true)
73
+ sr = d.result(:stats) and sr.remove!
74
+ end
75
+ end
76
+ cli.say
77
+ end
78
+
79
+ ##
80
+ # Perform essential-genes operation with MiGA::Cli +cli+
81
+ def check_ess(cli)
82
+ cli.say 'Looking for outdated essential genes'
83
+ cli.load_project.each_dataset do |d|
84
+ res = d.result(:essential_genes)
85
+ next if res.nil?
86
+
87
+ dir = res.file_path(:collection)
88
+ if dir.nil? || outdated_fastaai_ess(res)
89
+ cli.say " > Removing #{d.name}:essential_genes"
90
+ res.remove!
91
+ d.result(:stats)&.remove!
92
+ next
93
+ end
94
+ next if Dir["#{dir}/*.faa"].empty?
95
+
96
+ cli.say " > Fixing #{d.name}"
97
+ run_cmd <<~CMD
98
+ cd #{dir.shellescape} && tar -zcf proteins.tar.gz *.faa && rm *.faa
99
+ CMD
100
+ end
101
+ end
102
+
103
+ ##
104
+ # Perform mytaxa-scan operation with MiGA::Cli +cli+
105
+ def check_mts(cli)
106
+ cli.say 'Looking for unarchived MyTaxa Scan runs'
107
+ cli.load_project.each_dataset do |d|
108
+ res = d.result(:mytaxa_scan)
109
+ next if res.nil?
110
+
111
+ dir = res.file_path(:regions)
112
+ fix = false
113
+ unless dir.nil?
114
+ if Dir.exist? dir
115
+ run_cmd <<~CMD
116
+ cd #{dir.shellescape}/.. \
117
+ && tar -zcf '#{d.name}.reg.tar.gz' '#{d.name}.reg' \
118
+ && rm -r '#{d.name}.reg'
119
+ CMD
120
+ end
121
+ fix = true
122
+ end
123
+ %i[blast mytaxain wintax gene_ids region_ids].each do |ext|
124
+ file = res.file_path(ext)
125
+ unless file.nil?
126
+ FileUtils.rm(file) if File.exist? file
127
+ fix = true
128
+ end
129
+ end
130
+ if fix
131
+ cli.say " > Fixing #{d.name}"
132
+ d.add_result(:mytaxa_scan, true, force: true)
133
+ end
134
+ end
135
+ end
136
+
137
+ ##
138
+ # Perform start operation with MiGA::Cli +cli+
139
+ def check_start(cli)
140
+ cli.say 'Looking for legacy .start files lingering'
141
+ cli.load_project.each_dataset do |d|
142
+ d.each_result do |r_k, r|
143
+ if File.exist? r.path(:start)
144
+ cli.say " > Registering again #{d.name}:#{r_k}"
145
+ r.save
146
+ end
147
+ end
148
+ end
149
+ end
150
+
151
+ ##
152
+ # Perform taxonomy operation with MiGA::Cli +cli+
153
+ def check_tax(cli)
154
+ # cli.say 'o Checking for taxonomy/distances consistency'
155
+ # TODO: Find 95%ANI clusters with entries from different species
156
+ # TODO: Find different 95%ANI clusters with genomes from the same species
157
+ # TODO: Find AAI values too high or too low for each LCA rank
158
+ end
159
+ end