miga-base 0.4.1.0 → 0.4.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +4 -4
  2. data/bin/miga +2 -244
  3. data/lib/miga/cli/action/about.rb +44 -0
  4. data/lib/miga/cli/action/add.rb +139 -0
  5. data/lib/miga/cli/action/add_result.rb +26 -0
  6. data/lib/miga/cli/action/console.rb +19 -0
  7. data/lib/miga/cli/action/daemon.rb +74 -0
  8. data/lib/miga/cli/action/date.rb +18 -0
  9. data/lib/miga/cli/action/doctor.rb +210 -0
  10. data/lib/miga/cli/action/edit.rb +24 -0
  11. data/lib/miga/cli/action/files.rb +31 -0
  12. data/lib/miga/cli/action/find.rb +48 -0
  13. data/lib/miga/cli/action/generic.rb +44 -0
  14. data/lib/miga/cli/action/get.rb +132 -0
  15. data/lib/miga/cli/action/init.rb +343 -0
  16. data/lib/miga/cli/action/ln.rb +42 -0
  17. data/lib/miga/cli/action/ls.rb +55 -0
  18. data/lib/miga/cli/action/ncbi_get.rb +218 -0
  19. data/lib/miga/cli/action/new.rb +45 -0
  20. data/lib/miga/cli/action/next_step.rb +27 -0
  21. data/lib/miga/cli/action/plugins.rb +28 -0
  22. data/lib/miga/cli/action/rm.rb +25 -0
  23. data/lib/miga/cli/action/run.rb +39 -0
  24. data/lib/miga/cli/action/stats.rb +140 -0
  25. data/lib/miga/cli/action/summary.rb +49 -0
  26. data/lib/miga/cli/action/tax_dist.rb +102 -0
  27. data/lib/miga/cli/action/tax_index.rb +47 -0
  28. data/lib/miga/cli/action/tax_set.rb +59 -0
  29. data/lib/miga/cli/action/tax_test.rb +77 -0
  30. data/lib/miga/cli/action.rb +66 -0
  31. data/lib/miga/cli/base.rb +90 -0
  32. data/lib/miga/cli.rb +426 -0
  33. data/lib/miga/project/result.rb +14 -6
  34. data/lib/miga/remote_dataset.rb +1 -1
  35. data/lib/miga/tax_index.rb +5 -4
  36. data/lib/miga/taxonomy/base.rb +63 -0
  37. data/lib/miga/taxonomy.rb +87 -92
  38. data/lib/miga/version.rb +6 -6
  39. data/test/taxonomy_test.rb +49 -9
  40. data/utils/distance/commands.rb +11 -11
  41. data/utils/distance/pipeline.rb +5 -5
  42. metadata +43 -49
  43. data/actions/about.rb +0 -43
  44. data/actions/add.rb +0 -129
  45. data/actions/add_result.rb +0 -30
  46. data/actions/daemon.rb +0 -55
  47. data/actions/date.rb +0 -14
  48. data/actions/doctor.rb +0 -201
  49. data/actions/edit.rb +0 -33
  50. data/actions/files.rb +0 -43
  51. data/actions/find.rb +0 -41
  52. data/actions/get.rb +0 -105
  53. data/actions/init.rb +0 -301
  54. data/actions/ln.rb +0 -47
  55. data/actions/ls.rb +0 -61
  56. data/actions/ncbi_get.rb +0 -192
  57. data/actions/new.rb +0 -44
  58. data/actions/next_step.rb +0 -33
  59. data/actions/plugins.rb +0 -25
  60. data/actions/rm.rb +0 -29
  61. data/actions/run.rb +0 -45
  62. data/actions/stats.rb +0 -149
  63. data/actions/summary.rb +0 -57
  64. data/actions/tax_dist.rb +0 -106
  65. data/actions/tax_index.rb +0 -46
  66. data/actions/tax_set.rb +0 -63
  67. data/actions/tax_test.rb +0 -80
@@ -0,0 +1,210 @@
1
+ # @package MiGA
2
+ # @license Artistic-2.0
3
+
4
+ require 'miga/cli/action'
5
+ require 'sqlite3'
6
+
7
##
# CLI action that runs a set of consistency checks over a project. Every
# entry in +OPERATIONS+ maps to one +check_*+ method of this class.
class MiGA::Cli::Action::Doctor < MiGA::Cli::Action

  # Supported checks. Each key is both the +cli+ option that toggles the check
  # and the suffix of the +check_*+ method implementing it. Each value is the
  # pair [task name accepted by --ignore / --only, description].
  @@OPERATIONS = {
    db: ['databases', 'Check database files integrity'],
    dist: ['distances', 'Check distance summary tables'],
    files: ['files', 'Check for outdated files'],
    ess: ['essential-genes', 'Check for unarchived essential genes'],
    mts: ['mytaxa-scan', 'Check for unarchived MyTaxa scan'],
    start: ['start', 'Check for lingering .start files'],
    tax: ['taxonomy', 'Check for taxonomy consistency (not yet implemented)']
  }

  class << self
    ##
    # Hash of supported operations (see the class variable for its layout).
    def OPERATIONS
      @@OPERATIONS
    end
  end

  ##
  # Registers the command-line options. All checks are enabled by default;
  # --ignore disables the listed ones and --only disables all but one.
  # Raises a RuntimeError when a task name is not one of the available tasks.
  def parse_cli
    @@OPERATIONS.keys.each { |i| cli.defaults = {i => true} }

    # Task name (as typed by the user) => option key
    by_name = Hash[@@OPERATIONS.map { |k, v| [v[0], k] }]
    # Resolve a task name case-insensitively, failing loudly on typos instead
    # of silently toggling a nil option (--ignore) or crashing on nil (--only)
    resolve = lambda do |task|
      key = by_name[task.downcase]
      raise "Unknown task: #{task}" if key.nil?
      key
    end

    cli.parse do |opt|
      cli.opt_object(opt, [:project])
      opt.on(
        '--ignore TASK1,TASK2', Array,
        'Do not perform the task(s) listed. Available tasks are:',
        * @@OPERATIONS.values.map{ |v| "~ #{v[0]}: #{v[1]}" }
      ){ |v| v.each { |i| cli[resolve.call(i)] = false } }
      opt.on(
        '--only TASK',
        'Perform only the specified task (see --ignore)'
      ) do |v|
        # Resolve first so an unknown task leaves the current selection intact
        op_k = resolve.call(v)
        @@OPERATIONS.keys.each{ |i| cli[i] = false }
        cli[op_k] = true
      end
    end
  end

  ##
  # Runs a +count(*)+ query against table +metric+ of the SQLite3 database
  # +db_file+, and yields to the given block if SQLite3 raises an
  # SQLException.
  def check_sqlite3_database(db_file, metric)
    SQLite3::Database.new(db_file) do |conn|
      conn.execute("select count(*) from #{metric}").first
    end
  rescue SQLite3::SQLException
    yield
  end

  ##
  # Runs every enabled check, in the order listed in +OPERATIONS+.
  def perform
    # The returned project is not used here (each check loads it itself).
    # The call is kept from the original; presumably it makes the action fail
    # early when the project cannot be loaded -- TODO confirm.
    cli.load_project
    @@OPERATIONS.keys.each do |k|
      send("check_#{k}", cli) if cli[k]
    end
  end

  ##
  # For every dataset, queries the haai/aai/ani databases of the distances
  # and taxonomy results. When a query fails, deletes the database file
  # together with the result's +path(:done)+ and +path+ files.
  def check_db(cli)
    cli.say 'Checking databases integrity'
    cli.load_project.each_dataset do |d|
      [:distances, :taxonomy].each do |r_key|
        r = d.result(r_key)
        next unless r
        {haai_db: :aai, aai_db: :aai, ani_db: :ani}.each do |db_key, metric|
          db_file = r.file_path(db_key)
          next unless db_file
          check_sqlite3_database(db_file, metric) do
            cli.say(
              " > Removing #{db_key} #{r_key} table for #{d.name}")
            [db_file, r.path(:done), r.path].each do |f|
              File.unlink(f) if File.exist? f
            end
          end
        end
      end
    end
  end

  ##
  # Scans the ANI and AAI project-level distance matrices for rows that
  # mention datasets not registered in the project. Registered datasets that
  # share a row with an unregistered one get +cleanup_distances!+, and the
  # whole result is removed if any unregistered dataset was found.
  def check_dist(cli)
    p = cli.load_project
    [:ani, :aai].each do |dist|
      res = p.result("#{dist}_distances")
      next if res.nil?
      cli.say "Checking #{dist} table for consistent datasets"
      notok = {}
      fix = {}
      Zlib::GzipReader.open(res.file_path(:matrix)) do |fh|
        lineno = 0
        fh.each_line do |ln|
          next if (lineno += 1) == 1 # the first line is always skipped
          r = ln.split("\t")
          # Columns 1 and 2 are looked up as dataset names
          missing = [1, 2].select { |i| p.dataset(r[i]).nil? }
          next if missing.empty?
          [1, 2].each do |i|
            (missing.include?(i) ? notok : fix)[r[i]] = true
          end
        end
      end

      cli.say("- Fixing #{fix.size} datasets") unless fix.empty?
      fix.keys.each do |d_n|
        cli.say " > Fixing #{d_n}."
        p.dataset(d_n).cleanup_distances!
      end

      next if notok.empty?
      cli.say '- Unregistered datasets detected: '
      if notok.size <= 5
        notok.keys.each { |i| cli.say " > #{i}" }
      else
        cli.say " > #{notok.size}, including #{notok.keys.first}"
      end
      cli.say '- Removing tables, recompute'
      res.remove!
    end
  end

  ##
  # Re-registers (forced +add_result+) every dataset result that lists at
  # least one file missing from disk.
  def check_files(cli)
    cli.say 'Looking for outdated files in results'
    p = cli.load_project
    p.each_dataset do |d|
      d.each_result do |r_k, r|
        ok = true
        r.each_file do |_f_sym, _f_rel, f_abs|
          next if File.exist? f_abs
          ok = false
          break # one missing file is enough
        end
        next if ok
        cli.say " > Registering again #{d.name}:#{r_k}"
        d.add_result(r_k, true, force: true)
      end
    end
  end

  ##
  # For every dataset with an essential_genes result: removes the result if
  # it has no :collection file, otherwise archives any loose +.faa+ files in
  # that location into +proteins.tar.gz+ (by shelling out to tar).
  def check_ess(cli)
    cli.say 'Looking for unarchived essential genes'
    cli.load_project.each_dataset do |d|
      res = d.result(:essential_genes)
      next if res.nil?
      dir = res.file_path(:collection)
      if dir.nil?
        cli.say " > Removing #{d.name}:essential_genes"
        res.remove!
        next
      end
      next if Dir["#{dir}/*.faa"].empty?
      cli.say " > Fixing #{d.name}"
      # NOTE(review): +dir+ is interpolated into a shell command; this assumes
      # project paths contain no single quotes
      cmdo = `cd '#{dir}' && tar -zcf proteins.tar.gz *.faa && rm *.faa`.chomp
      warn(cmdo) unless cmdo.empty?
    end
  end

  ##
  # For every dataset with a mytaxa_scan result: archives the :regions
  # directory (if it still exists) into a tarball, deletes the intermediate
  # files listed below, and re-registers the result if anything was
  # registered under those keys.
  def check_mts(cli)
    cli.say 'Looking for unarchived MyTaxa Scan runs'
    cli.load_project.each_dataset do |d|
      res = d.result(:mytaxa_scan)
      next if res.nil?
      dir = res.file_path(:regions)
      fix = false
      unless dir.nil?
        if Dir.exist? dir
          cmdo = `cd '#{dir}/..' \
 && tar -zcf '#{d.name}.reg.tar.gz' '#{d.name}.reg' \
 && rm -r '#{d.name}.reg'`.chomp
          warn(cmdo) unless cmdo.empty?
        end
        # Flagged even if the directory is gone: the result still lists it
        fix = true
      end
      %w[blast mytaxain wintax gene_ids region_ids].each do |ext|
        file = res.file_path(ext.to_sym)
        next if file.nil?
        FileUtils.rm(file) if File.exist? file
        fix = true
      end
      next unless fix
      cli.say " > Fixing #{d.name}"
      d.add_result(:mytaxa_scan, true, force: true)
    end
  end

  ##
  # Saves again every dataset result that still has a +path(:start)+ file.
  def check_start(cli)
    cli.say 'Looking for legacy .start files lingering'
    cli.load_project.each_dataset do |d|
      d.each_result do |r_k, r|
        next unless File.exist? r.path(:start)
        cli.say " > Registering again #{d.name}:#{r_k}"
        r.save
      end
    end
  end

  ##
  # Placeholder: taxonomy consistency check is not implemented yet.
  def check_tax(cli)
    #cli.say 'o Checking for taxonomy/distances consistency'
    # TODO: Find 95%ANI clusters with entries from different species
  end
end
@@ -0,0 +1,24 @@
1
+ # @package MiGA
2
+ # @license Artistic-2.0
3
+
4
+ require 'miga/cli/action'
5
+
6
##
# CLI action that applies the metadata given with -m / --metadata to a
# project or dataset and saves it.
class MiGA::Cli::Action::Edit < MiGA::Cli::Action

  ##
  # Registers the object-selection options and the metadata option.
  def parse_cli
    cli.parse do |opt|
      cli.opt_object(opt, [:project, :dataset_opt])
      opt.on(
        '-m', '--metadata STRING',
        'Metadata as key-value pairs separated by = and delimited by comma',
        'Values are saved as strings except for booleans (true / false) or nil'
      ) do |metadata|
        cli[:metadata] = metadata
      end
    end
  end

  ##
  # Loads the target object, lets +cli+ add the metadata to it, and saves it.
  def perform
    target = cli.load_project_or_dataset
    cli.add_metadata(target)
    target.save
  end
end
@@ -0,0 +1,31 @@
1
+ # @package MiGA
2
+ # @license Artistic-2.0
3
+
4
+ require 'miga/cli/action'
5
+
6
##
# CLI action that prints the files of every result of a project or dataset.
class MiGA::Cli::Action::Files < MiGA::Cli::Action

  ##
  # Registers the options. By default no details are printed and the JSON
  # file of each result is included.
  def parse_cli
    cli.defaults = {details: false, json: true}
    cli.parse do |opt|
      cli.opt_object(opt, [:project, :dataset_opt])
      opt.on(
        '-i', '--info',
        'Print additional details for each file'
      ) do |flag|
        cli[:details] = flag
      end
      opt.on('--[no-]json',
        'Include (or not) JSON files containing results metadata',
        'JSON files are included by default'
      ) do |flag|
        cli[:json] = flag
      end
    end
  end

  ##
  # Prints one line per file. With details enabled, each line is prefixed by
  # the result name and the file key (or "json" for the result's own path),
  # separated by tabs.
  def perform
    cli.load_project_or_dataset.each_result do |result_name, result|
      if cli[:json]
        json_prefix = cli[:details] ? "#{result_name}\tjson\t" : ''
        cli.puts "#{json_prefix}#{result.path}"
      end
      result.each_file do |file_key, file_rel|
        file_prefix = cli[:details] ? "#{result_name}\t#{file_key}\t" : ''
        cli.puts "#{file_prefix}#{result.dir}/#{file_rel}"
      end
    end
  end
end
@@ -0,0 +1,48 @@
1
+ # @package MiGA
2
+ # @license Artistic-2.0
3
+
4
+ require 'miga/cli/action'
5
+
6
##
# CLI action that lists the unregistered datasets of a project and,
# optionally (-a), registers them.
class MiGA::Cli::Action::Find < MiGA::Cli::Action

  ##
  # Registers the options. By default datasets are only listed (dry run)
  # and, when added, are registered as reference datasets.
  def parse_cli
    cli.defaults = {add: false, ref: true}
    cli.parse do |opt|
      cli.opt_object(opt, [:project, :dataset_type])
      opt.on(
        '-a', '--add',
        'Register the datasets found',
        'By default, only lists them (dry run)'
      ){ |v| cli[:add] = v }
      opt.on(
        '-q', '--query',
        'Register datasets as query'
      ){ |v| cli[:ref] = !v }
      opt.on(
        '-u', '--user STRING',
        'Owner of the dataset.'
      ){ |v| cli[:user] = v }
      opt.on(
        '-m', '--metadata STRING',
        'Metadata as key-value pairs separated by = and delimited by comma',
        'Values are saved as strings except for booleans (true / false) or nil'
      ){ |v| cli[:metadata] = v }
    end
  end

  ##
  # Prints the name of each unregistered dataset. With +cli[:add]+ set, also
  # creates the dataset, adds the metadata, registers it in the project, and
  # reports the outcome of its first preprocessing.
  def perform
    p = cli.load_project
    p.unregistered_datasets.each do |dn|
      cli.puts dn
      next unless cli[:add]

      cli.say "Registering: #{dn}"
      d = Dataset.new(p, dn, cli[:ref])
      # Metadata handling is called on the CLI object, as the other actions
      # in this release do; the bare call targeted the action itself.
      # NOTE(review): cli[:user] is collected above but not applied here
      # explicitly -- confirm that cli.add_metadata takes care of it.
      d = cli.add_metadata(d)
      p.add_dataset(dn)
      res = d.first_preprocessing(true)
      cli.say "- #{res}"
    end
  end
end
@@ -0,0 +1,44 @@
1
+ # @package MiGA
2
+ # @license Artistic-2.0
3
+
4
+ require 'miga/cli/action'
5
+
6
##
# CLI action listing the available actions together with the generic
# options (help, version, citation).
class MiGA::Cli::Action::Generic < MiGA::Cli::Action

  ##
  # Builds the help screen: a table of every task description except the
  # one for :generic, followed by the generic options. Each generic option
  # prints its output and exits.
  def parse_cli
    cli.opt_common = false
    cli.parse do |opt|
      # Non-destructive filter: +keep_if+ would permanently drop :generic
      # from the collection returned by TASK_DESC
      descriptions = cli.class.TASK_DESC.reject { |k, _| k == :generic }
      opt.separator MiGA::MiGA.tabulate(
        [:action, :description], descriptions).join("\n")
      opt.separator ''
      opt.separator 'generic options:'
      opt.on(
        '-h', '--help',
        'Display this screen'
      ){ puts opt ; exit }
      opt.on(
        '-v', '--version',
        'Show MiGA version'
      ){ puts MiGA::MiGA.VERSION ; exit }
      opt.on(
        '-V', '--long-version',
        'Show complete MiGA version'
      ){ puts MiGA::MiGA.LONG_VERSION ; exit }
      opt.on(
        '-C', '--citation',
        'How to cite MiGA'
      ){ puts MiGA::MiGA.CITATION ; exit }
    end
  end

  ##
  # Nothing to perform: this action only exists for its option parser.
  def perform
  end

  ##
  # Intentionally empty.
  def complete
  end

  ##
  # Placeholder name used for this action.
  def name
    '{action}'
  end
end
@@ -0,0 +1,132 @@
1
+ # @package MiGA
2
+ # @license Artistic-2.0
3
+
4
+ require 'miga/cli/action'
5
+ require 'miga/remote_dataset'
6
+
7
##
# CLI action that downloads datasets from a remote database, either a single
# one described by the command-line options or several listed in a
# tab-delimited file (-F).
class MiGA::Cli::Action::Get < MiGA::Cli::Action

  ##
  # Registers the command-line options and their defaults.
  def parse_cli
    cli.defaults = {query: false, universe: :ncbi, db: :nuccore,
      get_md: false, only_md: false}
    cli.parse do |opt|
      cli.opt_object(opt, [:project, :dataset, :dataset_type])
      opt.on(
        '-I', '--ids ID1,ID2,...', Array,
        '(Mandatory unless -F) IDs in the remote database separated by commas'
      ){ |v| cli[:ids] = v }
      opt.on(
        '-U', '--universe STRING',
        "Universe of the remote database. By default: #{cli[:universe]}"
      ){ |v| cli[:universe] = v.to_sym }
      opt.on(
        '--db STRING',
        "Name of the remote database. By default: #{cli[:db]}"
      ){ |v| cli[:db] = v.to_sym }
      opt.on(
        '-F', '--file PATH',
        'Tab-delimited file (with header) listing the datasets to download',
        'The long form of most options are supported as header (without --)',
        'including: dataset, ids, universe, db, metadata',
        'For flags without value (like query) use true/false',
        'Unsupported values are: project, file, verbose, help, and debug'
      ){ |v| cli[:file] = v }
      opt.on(
        '-q', '--query',
        'Register the dataset as a query, not a reference dataset'
      ){ |v| cli[:query] = v }
      opt.on('--ignore-dup',
        'Ignore datasets that already exist'
      ){ |v| cli[:ignore_dup] = v }
      opt.on(
        '-d', '--description STRING',
        'Description of the dataset'
      ){ |v| cli[:description] = v }
      opt.on(
        '-c', '--comments STRING',
        'Comments on the dataset'
      ){ |v| cli[:comments] = v }
      opt.on(
        '-m', '--metadata STRING',
        'Metadata as key-value pairs separated by = and delimited by comma',
        'Values are saved as strings except for booleans (true / false) or nil'
      ){ |v| cli[:metadata] = v }
      opt.on(
        '--get-metadata',
        'Only download and update metadata for existing datasets'
      ){ |v| cli[:get_md] = v }
      opt.on(
        '--only-metadata',
        'Create datasets without input data but retrieve all metadata'
      ){ |v| cli[:only_md] = v }
      opt.on(
        '--api-key STRING',
        'API key for the given universe'
      ){ |v| cli[:api_key] = v }
    end
  end

  ##
  # Processes every requested dataset: the single one described by +cli+, or
  # one per data row of the file given with -F. For each one, either updates
  # the metadata of the existing dataset (--get-metadata) or creates it from
  # the remote entry.
  def perform
    glob = cli[:file].nil? ? [cli] : sub_clis_from_file(cli[:file])

    p = cli.load_project
    glob.each do |sub_cli|
      sub_cli.ensure_par(dataset: '-D', ids: '-I')
      unless sub_cli[:api_key].nil?
        ENV["#{sub_cli[:universe].to_s.upcase}_API_KEY"] = sub_cli[:api_key]
      end

      sub_cli.say "Dataset: #{sub_cli[:dataset]}"
      if sub_cli[:ignore_dup] && !sub_cli[:get_md]
        next if Dataset.exist?(p, sub_cli[:dataset])
      end

      sub_cli.say 'Locating remote dataset'
      rd = RemoteDataset.new(sub_cli[:ids], sub_cli[:db], sub_cli[:universe])

      if sub_cli[:get_md]
        sub_cli.say 'Updating dataset'
        d = p.dataset(sub_cli[:dataset])
        next if d.nil?
        md = sub_cli.add_metadata(d).metadata.data
        rd.update_metadata(d, md)
      else
        sub_cli.say 'Creating dataset'
        # A throw-away dataset is used only to build the metadata hash; it is
        # removed before the remote dataset is saved under the same name
        dummy_d = Dataset.new(p, sub_cli[:dataset])
        md = sub_cli.add_metadata(dummy_d).metadata.data
        # Read from +sub_cli+ so an only-metadata column in the -F file is
        # honored per row (+sub_cli+ is +cli+ itself when no file is given)
        md[:metadata_only] = true if sub_cli[:only_md]
        dummy_d.remove!
        rd.save_to(p, sub_cli[:dataset], !sub_cli[:query], md)
        p.add_dataset(sub_cli[:dataset])
      end
    end
  end

  private

  ##
  # Reads the tab-delimited file at +path+ (first line is the header) and
  # returns one parsed MiGA::Cli per data row, using the data of the main
  # +cli+ as defaults. Blank lines are skipped.
  def sub_clis_from_file(path)
    sub_clis = []
    header = nil
    File.open(path, 'r') do |fh|
      fh.each do |ln|
        row = ln.chomp.split(/\t/)
        if header.nil?
          header = row
          next
        end
        next if row.empty?
        sub_cli = MiGA::Cli.new(row_to_argv(header, row))
        sub_cli.defaults = cli.data
        sub_cli.action.parse_cli
        sub_clis << sub_cli
      end
    end
    sub_clis
  end

  ##
  # Translates one data +row+ into an argument vector, using the +header+
  # fields as long option names. Flag columns are passed only when their
  # value is "true" (case-insensitive). Raises a RuntimeError on headers
  # that are not supported in files.
  def row_to_argv(header, row)
    argv = [name]
    header.each_with_index do |field, k|
      option = field.downcase
      case option
      when *%w[query ignore-dup get-metadata only-metadata]
        # +split+ drops trailing empty fields, so row[k] may be nil
        argv << "--#{option}" if row[k].to_s.downcase == 'true'
      when *%w[project file verbose help debug]
        raise "Unsupported header: #{field}"
      else
        # Missing trailing cells are left to the defaults
        argv += ["--#{option}", row[k]] unless row[k].nil?
      end
    end
    argv
  end
end
+ end