miga-base 0.4.1.0 → 0.4.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (67) hide show
  1. checksums.yaml +4 -4
  2. data/bin/miga +2 -244
  3. data/lib/miga/cli/action/about.rb +44 -0
  4. data/lib/miga/cli/action/add.rb +139 -0
  5. data/lib/miga/cli/action/add_result.rb +26 -0
  6. data/lib/miga/cli/action/console.rb +19 -0
  7. data/lib/miga/cli/action/daemon.rb +74 -0
  8. data/lib/miga/cli/action/date.rb +18 -0
  9. data/lib/miga/cli/action/doctor.rb +210 -0
  10. data/lib/miga/cli/action/edit.rb +24 -0
  11. data/lib/miga/cli/action/files.rb +31 -0
  12. data/lib/miga/cli/action/find.rb +48 -0
  13. data/lib/miga/cli/action/generic.rb +44 -0
  14. data/lib/miga/cli/action/get.rb +132 -0
  15. data/lib/miga/cli/action/init.rb +343 -0
  16. data/lib/miga/cli/action/ln.rb +42 -0
  17. data/lib/miga/cli/action/ls.rb +55 -0
  18. data/lib/miga/cli/action/ncbi_get.rb +218 -0
  19. data/lib/miga/cli/action/new.rb +45 -0
  20. data/lib/miga/cli/action/next_step.rb +27 -0
  21. data/lib/miga/cli/action/plugins.rb +28 -0
  22. data/lib/miga/cli/action/rm.rb +25 -0
  23. data/lib/miga/cli/action/run.rb +39 -0
  24. data/lib/miga/cli/action/stats.rb +140 -0
  25. data/lib/miga/cli/action/summary.rb +49 -0
  26. data/lib/miga/cli/action/tax_dist.rb +102 -0
  27. data/lib/miga/cli/action/tax_index.rb +47 -0
  28. data/lib/miga/cli/action/tax_set.rb +59 -0
  29. data/lib/miga/cli/action/tax_test.rb +77 -0
  30. data/lib/miga/cli/action.rb +66 -0
  31. data/lib/miga/cli/base.rb +90 -0
  32. data/lib/miga/cli.rb +426 -0
  33. data/lib/miga/project/result.rb +14 -6
  34. data/lib/miga/remote_dataset.rb +1 -1
  35. data/lib/miga/tax_index.rb +5 -4
  36. data/lib/miga/taxonomy/base.rb +63 -0
  37. data/lib/miga/taxonomy.rb +87 -92
  38. data/lib/miga/version.rb +6 -6
  39. data/test/taxonomy_test.rb +49 -9
  40. data/utils/distance/commands.rb +11 -11
  41. data/utils/distance/pipeline.rb +5 -5
  42. metadata +43 -49
  43. data/actions/about.rb +0 -43
  44. data/actions/add.rb +0 -129
  45. data/actions/add_result.rb +0 -30
  46. data/actions/daemon.rb +0 -55
  47. data/actions/date.rb +0 -14
  48. data/actions/doctor.rb +0 -201
  49. data/actions/edit.rb +0 -33
  50. data/actions/files.rb +0 -43
  51. data/actions/find.rb +0 -41
  52. data/actions/get.rb +0 -105
  53. data/actions/init.rb +0 -301
  54. data/actions/ln.rb +0 -47
  55. data/actions/ls.rb +0 -61
  56. data/actions/ncbi_get.rb +0 -192
  57. data/actions/new.rb +0 -44
  58. data/actions/next_step.rb +0 -33
  59. data/actions/plugins.rb +0 -25
  60. data/actions/rm.rb +0 -29
  61. data/actions/run.rb +0 -45
  62. data/actions/stats.rb +0 -149
  63. data/actions/summary.rb +0 -57
  64. data/actions/tax_dist.rb +0 -106
  65. data/actions/tax_index.rb +0 -46
  66. data/actions/tax_set.rb +0 -63
  67. data/actions/tax_test.rb +0 -80
@@ -0,0 +1,210 @@
1
+ # @package MiGA
2
+ # @license Artistic-2.0
3
+
4
+ require 'miga/cli/action'
5
+ require 'sqlite3'
6
+
7
##
# CLI action that runs a series of consistency checks over a project. Each
# check is registered in +@@OPERATIONS+ under a key +k+ and implemented by the
# instance method +check_k+.
class MiGA::Cli::Action::Doctor < MiGA::Cli::Action

  ##
  # Registers the command-line options. Every operation is enabled by default;
  # +--ignore+ disables the listed tasks and +--only+ leaves a single task on.
  # Task names are matched case-insensitively. Raises a RuntimeError when a
  # task name is not one of the names listed in +@@OPERATIONS+.
  def parse_cli
    @@OPERATIONS.keys.each { |i| cli.defaults = {i => true} }
    cli.parse do |opt|
      # Task name as typed by the user => operation key
      operation_n = Hash[@@OPERATIONS.map { |k, v| [v[0], k] }]
      cli.opt_object(opt, [:project])
      opt.on(
        '--ignore TASK1,TASK2', Array,
        'Do not perform the task(s) listed. Available tasks are:',
        *@@OPERATIONS.values.map { |v| "~ #{v[0]}: #{v[1]}" }
      ) do |v|
        v.each do |i|
          # Fail loudly instead of silently storing a flag under a nil key
          op_k = operation_n[i.downcase] or raise "Unknown task: #{i}"
          cli[op_k] = false
        end
      end
      opt.on(
        '--only TASK',
        'Perform only the specified task (see --ignore)'
      ) do |v|
        op_k = operation_n[v.downcase] or raise "Unknown task: #{v}"
        @@OPERATIONS.keys.each { |i| cli[i] = false }
        cli[op_k] = true
      end
    end
  end

  ##
  # Opens the SQLite3 file +db_file+ and counts the rows of the table named
  # +metric+. Yields (without arguments) if the query fails with an SQL error
  # or if the file is reported as corrupt or as not being a database.
  def check_sqlite3_database(db_file, metric)
    SQLite3::Database.new(db_file) do |conn|
      conn.execute("select count(*) from #{metric}").first
    end
  rescue SQLite3::SQLException, SQLite3::CorruptException,
         SQLite3::NotADatabaseException
    yield
  end

  ##
  # Runs every enabled operation, in the order in which they are declared in
  # +@@OPERATIONS+.
  def perform
    # NOTE(review): kept from the original; presumably makes a project that
    # cannot be loaded fail before any check starts - confirm
    cli.load_project
    @@OPERATIONS.keys.each do |k|
      send("check_#{k}", cli) if cli[k]
    end
  end

  # Operation key => [task name used in the CLI, description shown in help]
  @@OPERATIONS = {
    db: ['databases', 'Check database files integrity'],
    dist: ['distances', 'Check distance summary tables'],
    files: ['files', 'Check for outdated files'],
    ess: ['essential-genes', 'Check for unarchived essential genes'],
    mts: ['mytaxa-scan', 'Check for unarchived MyTaxa scan'],
    start: ['start', 'Check for lingering .start files'],
    tax: ['taxonomy', 'Check for taxonomy consistency (not yet implemented)']
  }
  class << self
    ##
    # Returns the Hash of supported operations.
    def OPERATIONS
      @@OPERATIONS
    end
  end

  ##
  # For each dataset, tests the SQLite3 files registered in the +:distances+
  # and +:taxonomy+ results. When a database fails the test, the database
  # file, the result's +:done+ path and the result's own path are deleted.
  def check_db(cli)
    cli.say 'Checking databases integrity'
    cli.load_project.each_dataset do |d|
      [:distances, :taxonomy].each do |r_key|
        r = d.result(r_key) or next
        {haai_db: :aai, aai_db: :aai, ani_db: :ani}.each do |db_key, metric|
          db_file = r.file_path(db_key) or next
          check_sqlite3_database(db_file, metric) do
            cli.say(
              "  > Removing #{db_key} #{r_key} table for #{d.name}")
            [db_file, r.path(:done), r.path].each do |f|
              File.unlink(f) if File.exist? f
            end # each |f|
          end # check_sqlite3_database
        end # each |db_key, metric|
      end # each |r_key|
    end # each |d|
  end

  ##
  # Scans the gzipped +:matrix+ file of the project results +ani_distances+
  # and +aai_distances+. For every row naming a dataset that is not in the
  # project, the unknown name is reported and the other dataset of that row
  # (if it is in the project) gets +cleanup_distances!+ called on it. If any
  # unknown name was seen, the project result is removed.
  def check_dist(cli)
    p = cli.load_project
    [:ani, :aai].each do |dist|
      res = p.result("#{dist}_distances")
      next if res.nil?
      cli.say "Checking #{dist} table for consistent datasets"
      notok = {}
      fix = {}
      Zlib::GzipReader.open(res.file_path(:matrix)) do |fh|
        lineno = 0
        fh.each_line do |ln|
          next if (lineno += 1) == 1 # The first line is skipped
          # The dataset names are read from the second and third columns
          names = ln.split("\t").values_at(1, 2)
          missing, present = names.partition { |n| p.dataset(n).nil? }
          next if missing.empty?
          missing.each { |n| notok[n] = true }
          present.each { |n| fix[n] = true }
        end
      end

      cli.say("- Fixing #{fix.size} datasets") unless fix.empty?
      fix.keys.each do |d_n|
        cli.say "  > Fixing #{d_n}."
        p.dataset(d_n).cleanup_distances!
      end

      unless notok.empty?
        cli.say '- Unregistered datasets detected: '
        if notok.size <= 5
          notok.keys.each { |i| cli.say "  > #{i}" }
        else
          cli.say "  > #{notok.size}, including #{notok.keys.first}"
        end
        cli.say '- Removing tables, recompute'
        res.remove!
      end
    end
  end

  ##
  # Registers again (with +force: true+) every dataset result that lists at
  # least one file that does not exist.
  def check_files(cli)
    cli.say 'Looking for outdated files in results'
    p = cli.load_project
    p.each_dataset do |d|
      d.each_result do |r_k, r|
        ok = true
        r.each_file do |_f_sym, _f_rel, f_abs|
          unless File.exist? f_abs
            ok = false
            break # One missing file is enough
          end
        end
        unless ok
          cli.say "  > Registering again #{d.name}:#{r_k}"
          d.add_result(r_k, true, force: true)
        end
      end
    end
  end

  ##
  # For each dataset with an +:essential_genes+ result: removes the result if
  # it has no +:collection+ file path; otherwise, if that directory contains
  # any +*.faa+ file, packs them into +proteins.tar.gz+ and deletes them.
  def check_ess(cli)
    cli.say 'Looking for unarchived essential genes'
    cli.load_project.each_dataset do |d|
      res = d.result(:essential_genes)
      next if res.nil?
      dir = res.file_path(:collection)
      if dir.nil?
        cli.say "  > Removing #{d.name}:essential_genes"
        res.remove!
        next
      end
      unless Dir["#{dir}/*.faa"].empty?
        cli.say "  > Fixing #{d.name}"
        # NOTE(review): +dir+ is interpolated into a shell command between
        # single quotes; a path containing a single quote would break it
        cmdo = `cd '#{dir}' && tar -zcf proteins.tar.gz *.faa && rm *.faa`.chomp
        warn(cmdo) unless cmdo.empty?
      end
    end
  end

  ##
  # For each dataset with a +:mytaxa_scan+ result: archives the +:regions+
  # directory (when it exists) as +<name>.reg.tar.gz+, deletes the
  # intermediate files still registered in the result, and registers the
  # result again (with +force: true+) if any of those entries was present.
  def check_mts(cli)
    cli.say 'Looking for unarchived MyTaxa Scan runs'
    cli.load_project.each_dataset do |d|
      res = d.result(:mytaxa_scan)
      next if res.nil?
      dir = res.file_path(:regions)
      fix = false
      unless dir.nil?
        if Dir.exist? dir
          cmdo = `cd '#{dir}/..' \
                && tar -zcf '#{d.name}.reg.tar.gz' '#{d.name}.reg' \
                && rm -r '#{d.name}.reg'`.chomp
          warn(cmdo) unless cmdo.empty?
        end
        fix = true
      end
      %w[blast mytaxain wintax gene_ids region_ids].each do |ext|
        file = res.file_path(ext.to_sym)
        unless file.nil?
          FileUtils.rm(file) if File.exist? file
          fix = true
        end
      end
      if fix
        cli.say "  > Fixing #{d.name}"
        d.add_result(:mytaxa_scan, true, force: true)
      end
    end
  end

  ##
  # Saves again every dataset result for which the file at +path(:start)+
  # exists.
  def check_start(cli)
    cli.say 'Looking for legacy .start files lingering'
    cli.load_project.each_dataset do |d|
      d.each_result do |r_k, r|
        if File.exist? r.path(:start)
          cli.say "  > Registering again #{d.name}:#{r_k}"
          r.save
        end
      end
    end
  end

  ##
  # Placeholder: the taxonomy check is not implemented and does nothing.
  def check_tax(cli)
    #cli.say 'o Checking for taxonomy/distances consistency'
    # TODO: Find 95%ANI clusters with entries from different species
  end
end
@@ -0,0 +1,24 @@
1
+ # @package MiGA
2
+ # @license Artistic-2.0
3
+
4
+ require 'miga/cli/action'
5
+
6
##
# CLI action that applies the metadata given in the command line to a project
# or a dataset and saves it.
class MiGA::Cli::Action::Edit < MiGA::Cli::Action

  ##
  # Registers the project / optional dataset options and +--metadata+.
  def parse_cli
    cli.parse do |opt|
      cli.opt_object(opt, %i[project dataset_opt])
      opt.on(
        '-m', '--metadata STRING',
        'Metadata as key-value pairs separated by = and delimited by comma',
        'Values are saved as strings except for booleans (true / false) or nil'
      ) do |pairs|
        cli[:metadata] = pairs
      end
    end
  end

  ##
  # Loads the target object, lets the CLI add the metadata to it, and saves it.
  def perform
    cli.load_project_or_dataset.tap { |target| cli.add_metadata(target) }.save
  end
end
@@ -0,0 +1,31 @@
1
+ # @package MiGA
2
+ # @license Artistic-2.0
3
+
4
+ require 'miga/cli/action'
5
+
6
##
# CLI action that prints the files of every result in a project or dataset.
class MiGA::Cli::Action::Files < MiGA::Cli::Action

  ##
  # Registers the options. By default no details are printed and the JSON
  # files of the results are included.
  def parse_cli
    cli.defaults = { details: false, json: true }
    cli.parse do |opt|
      cli.opt_object(opt, %i[project dataset_opt])
      opt.on('-i', '--info', 'Print additional details for each file') do |flag|
        cli[:details] = flag
      end
      opt.on(
        '--[no-]json',
        'Include (or not) JSON files containing results metadata',
        'JSON files are included by default'
      ) do |flag|
        cli[:json] = flag
      end
    end
  end

  ##
  # Prints one line per file. With details enabled, each line is prefixed by
  # the result key and the file key (or +json+ for the result's own path),
  # separated by tabs.
  def perform
    cli.load_project_or_dataset.each_result do |result_key, result|
      if cli[:json]
        json_prefix = cli[:details] ? "#{result_key}\tjson\t" : ''
        cli.puts "#{json_prefix}#{result.path}"
      end
      result.each_file do |file_key, rel_path|
        file_prefix = cli[:details] ? "#{result_key}\t#{file_key}\t" : ''
        cli.puts "#{file_prefix}#{result.dir}/#{rel_path}"
      end
    end
  end
end
@@ -0,0 +1,48 @@
1
+ # @package MiGA
2
+ # @license Artistic-2.0
3
+
4
+ require 'miga/cli/action'
5
+
6
##
# CLI action that lists the unregistered datasets of a project and,
# optionally, registers them.
class MiGA::Cli::Action::Find < MiGA::Cli::Action

  ##
  # Registers the options. By default datasets are only listed (+add: false+)
  # and treated as reference datasets (+ref: true+).
  def parse_cli
    cli.defaults = {add: false, ref: true}
    cli.parse do |opt|
      cli.opt_object(opt, [:project, :dataset_type])
      opt.on(
        '-a', '--add',
        'Register the datasets found',
        'By default, only lists them (dry run)'
      ){ |v| cli[:add] = v }
      opt.on(
        '-q', '--query',
        'Register datasets as query'
      ){ |v| cli[:ref] = !v }
      # NOTE(review): cli[:user] is stored here but never read in this class
      opt.on(
        '-u', '--user STRING',
        'Owner of the dataset.'
      ){ |v| cli[:user] = v }
      opt.on(
        '-m', '--metadata STRING',
        'Metadata as key-value pairs separated by = and delimited by comma',
        'Values are saved as strings except for booleans (true / false) or nil'
      ){ |v| cli[:metadata] = v }
    end
  end

  ##
  # Prints the name of every unregistered dataset. With +--add+, each one is
  # also created, given the command-line metadata, added to the project, and
  # sent through its first preprocessing step.
  def perform
    p = cli.load_project
    ud = p.unregistered_datasets
    ud.each do |dn|
      cli.puts dn
      next unless cli[:add]

      cli.say "Registering: #{dn}"
      d = Dataset.new(p, dn, cli[:ref])
      # Metadata is handled by the CLI object, as in the other actions
      cli.add_metadata(d)
      p.add_dataset(dn)
      res = d.first_preprocessing(true)
      cli.say "- #{res}"
    end
  end
end
@@ -0,0 +1,44 @@
1
+ # @package MiGA
2
+ # @license Artistic-2.0
3
+
4
+ require 'miga/cli/action'
5
+
6
##
# CLI action used to display the list of available actions and the generic
# options (help, version, citation).
class MiGA::Cli::Action::Generic < MiGA::Cli::Action

  ##
  # Builds the help screen: a table of actions and descriptions (excluding
  # +:generic+ itself) followed by the generic options. Each generic option
  # prints its text and exits.
  def parse_cli
    cli.opt_common = false
    cli.parse do |opt|
      # Use a filtered copy: the task descriptions belong to the CLI class
      # and must not be modified here
      descriptions = cli.class.TASK_DESC.reject { |k, _| k == :generic }
      opt.separator MiGA::MiGA.tabulate(
        [:action, :description], descriptions).join("\n")
      opt.separator ''
      opt.separator 'generic options:'
      opt.on(
        '-h', '--help',
        'Display this screen'
      ){ puts opt ; exit }
      opt.on(
        '-v', '--version',
        'Show MiGA version'
      ){ puts MiGA::MiGA.VERSION ; exit }
      opt.on(
        '-V', '--long-version',
        'Show complete MiGA version'
      ){ puts MiGA::MiGA.LONG_VERSION ; exit }
      opt.on(
        '-C', '--citation',
        'How to cite MiGA'
      ){ puts MiGA::MiGA.CITATION ; exit }
    end
  end

  ##
  # Nothing to perform.
  def perform
  end

  ##
  # Nothing to do on completion.
  def complete
  end

  ##
  # Placeholder shown in place of an action name.
  def name
    '{action}'
  end
end
@@ -0,0 +1,132 @@
1
+ # @package MiGA
2
+ # @license Artistic-2.0
3
+
4
+ require 'miga/cli/action'
5
+ require 'miga/remote_dataset'
6
+
7
##
# CLI action that downloads datasets (or only their metadata) from a remote
# database and registers them in a project.
class MiGA::Cli::Action::Get < MiGA::Cli::Action

  ##
  # Registers the options. Defaults: reference dataset (+query: false+),
  # universe +:ncbi+, database +:nuccore+, and both metadata-only modes off.
  def parse_cli
    cli.defaults = {query: false, universe: :ncbi, db: :nuccore,
      get_md: false, only_md: false}
    cli.parse do |opt|
      cli.opt_object(opt, [:project, :dataset, :dataset_type])
      opt.on(
        '-I', '--ids ID1,ID2,...', Array,
        '(Mandatory unless -F) IDs in the remote database separated by commas'
      ){ |v| cli[:ids] = v }
      opt.on(
        '-U', '--universe STRING',
        "Universe of the remote database. By default: #{cli[:universe]}"
      ){ |v| cli[:universe] = v.to_sym }
      opt.on(
        '--db STRING',
        "Name of the remote database. By default: #{cli[:db]}"
      ){ |v| cli[:db] = v.to_sym }
      opt.on(
        '-F', '--file PATH',
        'Tab-delimited file (with header) listing the datasets to download',
        'The long form of most options are supported as header (without --)',
        'including: dataset, ids, universe, db, metadata',
        'For flags without value (like query) use true/false',
        'Unsupported values are: project, file, verbose, help, and debug'
      ){ |v| cli[:file] = v }
      opt.on(
        '-q', '--query',
        'Register the dataset as a query, not a reference dataset'
      ){ |v| cli[:query] = v }
      opt.on('--ignore-dup',
        'Ignore datasets that already exist'
      ){ |v| cli[:ignore_dup] = v }
      opt.on(
        '-d', '--description STRING',
        'Description of the dataset'
      ){ |v| cli[:description] = v }
      opt.on(
        '-c', '--comments STRING',
        'Comments on the dataset'
      ){ |v| cli[:comments] = v }
      opt.on(
        '-m', '--metadata STRING',
        'Metadata as key-value pairs separated by = and delimited by comma',
        'Values are saved as strings except for booleans (true / false) or nil'
      ){ |v| cli[:metadata] = v }
      opt.on(
        '--get-metadata',
        'Only download and update metadata for existing datasets'
      ){ |v| cli[:get_md] = v }
      opt.on(
        '--only-metadata',
        'Create datasets without input data but retrieve all metadata'
      ){ |v| cli[:only_md] = v }
      opt.on(
        '--api-key STRING',
        'API key for the given universe'
      ){ |v| cli[:api_key] = v }
    end
  end

  ##
  # Processes one request per dataset: the command line itself or, with
  # +--file+, one request per data row of the tab-delimited file. Each row is
  # turned into its own argument list and parsed by a new CLI object that
  # takes the data of the main CLI as defaults. Raises a RuntimeError if the
  # file header includes an unsupported field.
  def perform
    glob = [cli]
    unless cli[:file].nil?
      glob = []
      File.open(cli[:file], 'r') do |fh|
        h = nil
        fh.each do |ln|
          r = ln.chomp.split(/\t/)
          if h.nil?
            h = r # The first line is the header
            next
          end

          argv_i = [self.name]
          h.each_with_index do |field, k|
            # split drops trailing empty cells, so r[k] can be nil
            case field.downcase
            when *%w[query ignore-dup get-metadata only-metadata]
              argv_i << "--#{field.downcase}" if r[k].to_s.downcase == 'true'
            when *%w[project file verbose help debug]
              raise "Unsupported header: #{field}"
            else
              argv_i += ["--#{field.downcase}", r[k]] unless r[k].nil?
            end
          end
          sub_cli = MiGA::Cli.new(argv_i)
          sub_cli.defaults = cli.data
          sub_cli.action.parse_cli
          glob << sub_cli
        end
      end
    end

    p = cli.load_project
    glob.each do |sub_cli|
      sub_cli.ensure_par(dataset: '-D', ids: '-I')
      unless sub_cli[:api_key].nil?
        ENV["#{sub_cli[:universe].to_s.upcase}_API_KEY"] = sub_cli[:api_key]
      end

      sub_cli.say "Dataset: #{sub_cli[:dataset]}"
      if sub_cli[:ignore_dup] && !sub_cli[:get_md]
        next if Dataset.exist?(p, sub_cli[:dataset])
      end

      sub_cli.say 'Locating remote dataset'
      rd = RemoteDataset.new(sub_cli[:ids], sub_cli[:db], sub_cli[:universe])

      if sub_cli[:get_md]
        sub_cli.say 'Updating dataset'
        d = p.dataset(sub_cli[:dataset])
        next if d.nil?
        md = sub_cli.add_metadata(d).metadata.data
        rd.update_metadata(d, md)
      else
        sub_cli.say 'Creating dataset'
        # Temporary dataset used only to build the metadata hash
        dummy_d = Dataset.new(p, sub_cli[:dataset])
        md = sub_cli.add_metadata(dummy_d).metadata.data
        # Honor the flag both per row (file header) and from the command line
        md[:metadata_only] = true if sub_cli[:only_md] || cli[:only_md]
        dummy_d.remove!
        rd.save_to(p, sub_cli[:dataset], !sub_cli[:query], md)
        p.add_dataset(sub_cli[:dataset])
      end
    end
  end
end