miga-base 0.4.1.0 → 0.4.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +4 -4
  2. data/bin/miga +2 -244
  3. data/lib/miga/cli/action/about.rb +44 -0
  4. data/lib/miga/cli/action/add.rb +139 -0
  5. data/lib/miga/cli/action/add_result.rb +26 -0
  6. data/lib/miga/cli/action/console.rb +19 -0
  7. data/lib/miga/cli/action/daemon.rb +74 -0
  8. data/lib/miga/cli/action/date.rb +18 -0
  9. data/lib/miga/cli/action/doctor.rb +210 -0
  10. data/lib/miga/cli/action/edit.rb +24 -0
  11. data/lib/miga/cli/action/files.rb +31 -0
  12. data/lib/miga/cli/action/find.rb +48 -0
  13. data/lib/miga/cli/action/generic.rb +44 -0
  14. data/lib/miga/cli/action/get.rb +132 -0
  15. data/lib/miga/cli/action/init.rb +343 -0
  16. data/lib/miga/cli/action/ln.rb +42 -0
  17. data/lib/miga/cli/action/ls.rb +55 -0
  18. data/lib/miga/cli/action/ncbi_get.rb +218 -0
  19. data/lib/miga/cli/action/new.rb +45 -0
  20. data/lib/miga/cli/action/next_step.rb +27 -0
  21. data/lib/miga/cli/action/plugins.rb +28 -0
  22. data/lib/miga/cli/action/rm.rb +25 -0
  23. data/lib/miga/cli/action/run.rb +39 -0
  24. data/lib/miga/cli/action/stats.rb +140 -0
  25. data/lib/miga/cli/action/summary.rb +49 -0
  26. data/lib/miga/cli/action/tax_dist.rb +102 -0
  27. data/lib/miga/cli/action/tax_index.rb +47 -0
  28. data/lib/miga/cli/action/tax_set.rb +59 -0
  29. data/lib/miga/cli/action/tax_test.rb +77 -0
  30. data/lib/miga/cli/action.rb +66 -0
  31. data/lib/miga/cli/base.rb +90 -0
  32. data/lib/miga/cli.rb +426 -0
  33. data/lib/miga/project/result.rb +14 -6
  34. data/lib/miga/remote_dataset.rb +1 -1
  35. data/lib/miga/tax_index.rb +5 -4
  36. data/lib/miga/taxonomy/base.rb +63 -0
  37. data/lib/miga/taxonomy.rb +87 -92
  38. data/lib/miga/version.rb +6 -6
  39. data/test/taxonomy_test.rb +49 -9
  40. data/utils/distance/commands.rb +11 -11
  41. data/utils/distance/pipeline.rb +5 -5
  42. metadata +43 -49
  43. data/actions/about.rb +0 -43
  44. data/actions/add.rb +0 -129
  45. data/actions/add_result.rb +0 -30
  46. data/actions/daemon.rb +0 -55
  47. data/actions/date.rb +0 -14
  48. data/actions/doctor.rb +0 -201
  49. data/actions/edit.rb +0 -33
  50. data/actions/files.rb +0 -43
  51. data/actions/find.rb +0 -41
  52. data/actions/get.rb +0 -105
  53. data/actions/init.rb +0 -301
  54. data/actions/ln.rb +0 -47
  55. data/actions/ls.rb +0 -61
  56. data/actions/ncbi_get.rb +0 -192
  57. data/actions/new.rb +0 -44
  58. data/actions/next_step.rb +0 -33
  59. data/actions/plugins.rb +0 -25
  60. data/actions/rm.rb +0 -29
  61. data/actions/run.rb +0 -45
  62. data/actions/stats.rb +0 -149
  63. data/actions/summary.rb +0 -57
  64. data/actions/tax_dist.rb +0 -106
  65. data/actions/tax_index.rb +0 -46
  66. data/actions/tax_set.rb +0 -63
  67. data/actions/tax_test.rb +0 -80
@@ -0,0 +1,343 @@
1
+ # @package MiGA
2
+ # @license Artistic-2.0
3
+
4
+ require 'miga/cli/action'
5
+ require 'shellwords'
6
+
7
##
# Action behind `miga init`: interactively inspects the system for the
# software, R packages and Ruby gems used by MiGA, writes the `~/.miga_rc`
# Bash configuration, and creates the default daemon template
# (`~/.miga_daemon.json`).
class MiGA::Cli::Action::Init < MiGA::Cli::Action

  ##
  # Declare the defaults and the command-line options of the action.
  def parse_cli
    cli.interactive = true
    cli.defaults = {
      mytaxa: nil,
      config: File.expand_path('.miga_modules', ENV['HOME']),
      ask: false, auto: false, dtype: :bash
    }
    cli.parse do |opt|
      opt.on(
        '-c', '--config PATH',
        'Path to the Bash configuration file',
        "By default: #{cli[:config]}"
      ) { |v| cli[:config] = v }
      opt.on(
        '--[no-]mytaxa',
        'Should I try setting up MyTaxa its dependencies?',
        'By default: interactive (true if --auto)'
      ) { |v| cli[:mytaxa] = v }
      opt.on(
        '--daemon-type STRING',
        'Type of daemon launcher, one of: bash, qsub, msub, slurm',
        "By default: interactive (#{cli[:dtype]} if --auto)"
      ) { |v| cli[:dtype] = v.to_sym }
      opt.on(
        '--ask-all',
        'Ask for the location of all software',
        'By default, only the locations missing in PATH are requested'
      ) { |v| cli[:ask] = v }
    end
  end

  ##
  # Run the complete initialization: greet the user, optionally list the
  # requirements, (re)write `~/.miga_rc`, check software, MyTaxa databases,
  # R packages and Ruby gems, and configure the daemon template.
  #
  # The rc file is opened in block form so that it is flushed and closed on
  # every exit path, including the +exit+ calls and the exceptions raised by
  # the individual checks.
  def perform
    requirements = File.expand_path('utils/requirements.txt', MiGA.root_path)
    cli.puts <<BANNER
===[ Welcome to MiGA, the Microbial Genome Atlas ]===

I'm the initialization script, and I'll sniff around your computer to
make sure you have all the requirements for MiGA data processing.

BANNER

    list_requirements(requirements)
    rc_path = File.expand_path('.miga_rc', ENV['HOME'])
    confirm_previous_configuration(rc_path)

    File.open(rc_path, 'w') do |rc_fh|
      rc_fh.puts <<BASH
#!/bin/bash
# `miga init` made this on #{Time.now}

BASH

      check_configuration_script(rc_fh)
      paths = check_software_requirements(rc_fh, requirements)
      check_mytaxa_databases(paths) if cli[:mytaxa]
      check_r_packages(paths)
      check_ruby_gems(paths)
      configure_daemon

      rc_fh.puts <<FOOT

MIGA_CONFIG_VERSION='#{MiGA::MiGA.VERSION}'
MIGA_CONFIG_LONGVERSION='#{MiGA::MiGA.LONG_VERSION}'
MIGA_CONFIG_DATE='#{Time.now}'

FOOT
    end

    cli.puts 'Configuration complete. MiGA is ready to work!'
    cli.puts ''
  end

  ##
  # Intentionally empty: this action does nothing when called without
  # arguments.
  def empty_action
  end

  ##
  # Run the shell command +cmd+ after sourcing the Bash configuration script
  # registered in +cli[:config]+, and return its standard output.
  # The exit status is available in +$?+ afterwards.
  def run_cmd(cli, cmd)
    # The script path is shell-escaped so that spaces, quotes or `$` in it
    # cannot break (or inject into) the command line
    `. #{cli[:config].to_s.shellescape} && #{cmd}`
  end

  ##
  # Pipe the R code +cmd+ into the R interpreter registered in +paths['R']+
  # (an unescaped path) and return the combined standard output and error.
  def run_r_cmd(cli, paths, cmd)
    run_cmd(cli,
      "echo #{cmd.shellescape} | #{paths['R'].shellescape} --vanilla -q 2>&1")
  end

  ##
  # Is the R package +pkg+ loadable by the configured R interpreter?
  def test_r_package(cli, paths, pkg)
    run_r_cmd(cli, paths, "library('#{pkg}')")
    $?.success?
  end

  ##
  # Install the R package +pkg+ using the configured R interpreter.
  # Returns the output of the installation.
  def install_r_package(cli, paths, pkg)
    r_cmd = "install.packages('#{pkg}', repos='http://cran.rstudio.com/')"
    run_r_cmd(cli, paths, r_cmd)
  end

  ##
  # Can the Ruby interpreter registered in +paths['ruby']+ (an unescaped
  # path) require the gem +pkg+?
  def test_ruby_gem(cli, paths, pkg)
    run_cmd(cli,
      "#{paths['ruby'].shellescape} -r #{pkg.shellescape} -e '' 2>/dev/null")
    $?.success?
  end

  ##
  # Install the gem +pkg+ (as a user gem) with the Ruby interpreter
  # registered in +paths['ruby']+. Returns the output of the installation.
  def install_ruby_gem(cli, paths, pkg)
    gem_cmd = "Gem::GemRunner.new.run %w(install --user #{pkg})"
    run_cmd(cli, "#{paths['ruby'].shellescape} \
      -r rubygems -r rubygems/gem_runner \
      -e #{gem_cmd.shellescape} 2>&1")
  end

  private

  ##
  # Offer to print the requirements file located at +requirements+.
  def list_requirements(requirements)
    answer = cli.ask_user(
      'Would you like to see all the requirements before starting?',
      'no', %w(yes no))
    return unless answer == 'yes'

    cli.puts ''
    File.foreach(requirements) { |ln| cli.puts ln }
    cli.puts ''
  end

  ##
  # If +rc_path+ already exists, ask for confirmation before overwriting it
  # and terminate the process (status 0) if the user declines.
  def confirm_previous_configuration(rc_path)
    return unless File.exist? rc_path

    answer = cli.ask_user(
      'I found a previous configuration. Do you want to continue?',
      'yes', %w(yes no))
    return unless answer == 'no'

    cli.puts 'OK, see you soon!'
    exit(0)
  end

  ##
  # Locate the Bash startup script and register it in +rc_fh+. When no script
  # is found, +cli[:config]+ falls back to /dev/null so that #run_cmd can
  # still source it.
  def check_configuration_script(rc_fh)
    unless File.exist? cli[:config]
      cli[:config] = cli.ask_user(
        'Is there a script I need to load at startup?',
        cli[:config])
    end
    if File.exist? cli[:config]
      cli[:config] = File.expand_path(cli[:config])
      cli.puts "Found bash configuration script: #{cli[:config]}"
      rc_fh.puts "MIGA_STARTUP='#{cli[:config]}'"
      rc_fh.puts '. "$MIGA_STARTUP"'
    else
      cli[:config] = '/dev/null'
    end
    cli.puts ''
  end

  ##
  # Test every program listed in the tab-delimited file +requirements+ and
  # write the necessary PATH additions to +rc_fh+.
  #
  # Returns a Hash mapping each binary name to its absolute, *unescaped*
  # path; consumers escape the path at the point where a command is built.
  def check_software_requirements(rc_fh, requirements)
    cli.puts 'Looking for requirements:'
    if cli[:mytaxa].nil?
      cli[:mytaxa] = cli.ask_user(
        'Should I include MyTaxa modules?',
        'yes', %w(yes no)) == 'yes'
    end
    rc_fh.puts 'export MIGA_MYTAXA="no"' unless cli[:mytaxa]
    paths = {}
    rc_fh.puts 'MIGA_PATH=""'
    File.open(requirements, 'r') do |fh|
      fh.each_line do |ln|
        # The first two lines of the file are skipped; the line number is
        # taken from this handle, not from the global `$.`
        next if fh.lineno < 3

        r = ln.chomp.split(/\t+/)
        next if r[1].nil? # Nothing to test without a binary name
        next if r[0] =~ /\(opt\)$/ && !cli[:mytaxa]

        note = r[3] ? " (#{r[3]})" : ''
        cli.print "Testing #{r[0]}#{note}... "
        paths[r[1]] = locate_software(rc_fh, r[1])
      end
    end
    rc_fh.puts 'export PATH="$MIGA_PATH$PATH"'
    cli.puts ''
    paths
  end

  ##
  # Find the directory holding the executable +bin+, asking the user
  # until an executable is found. Directories other than the one reported
  # by `which` are prepended to MIGA_PATH in +rc_fh+.
  #
  # Returns the absolute (unescaped) path to the executable.
  def locate_software(rc_fh, bin)
    path = nil
    loop do
      # `which` prints nothing when the binary is missing, and the dirname
      # of an empty string is '.'
      d_path = File.dirname(run_cmd(cli, "which #{bin.shellescape}"))
      if cli[:ask] || d_path == '.'
        path = cli.ask_user('Where can I find it?', d_path, nil, true)
      else
        path = d_path
        cli.puts path
      end
      if File.executable?(File.expand_path(bin, path))
        if d_path != path
          rc_fh.puts "MIGA_PATH=\"#{path}:$MIGA_PATH\" # #{bin}"
        end
        break
      end
      cli.print "I cannot find #{bin} "
    end
    File.expand_path(bin, path)
  end

  ##
  # Check that the MyTaxa databases exist next to the MyTaxa executable
  # registered in +paths+. Terminates the process (status 1) with
  # instructions if any of them is missing.
  def check_mytaxa_databases(paths)
    cli.puts 'Looking for MyTaxa databases:'
    mt = File.dirname paths['MyTaxa']
    cli.print 'Looking for scores... '
    unless Dir.exist?(File.expand_path('db', mt))
      cli.puts "no.\nExecute 'python2 #{mt}/utils/download_db.py'."
      exit(1)
    end
    cli.puts 'yes.'
    cli.print 'Looking for diamond db... '
    unless File.exist?(File.expand_path('AllGenomes.faa.dmnd', mt))
      cli.puts "no.\nDownload " \
        "'http://enve-omics.ce.gatech.edu/data/public_mytaxa/" \
        "AllGenomes.faa.dmnd' into #{mt}."
      exit(1)
    end
    cli.puts 'yes.'
    cli.puts ''
  end

  ##
  # Test the R packages used by MiGA and try to install the missing ones.
  # Raises a RuntimeError if a package cannot be installed.
  def check_r_packages(paths)
    cli.puts 'Looking for R packages:'
    %w(enveomics.R ape cluster vegan).each do |pkg|
      cli.print "Testing #{pkg}... "
      if test_r_package(cli, paths, pkg)
        cli.puts 'yes.'
      else
        cli.puts 'no, installing'
        cli.print install_r_package(cli, paths, pkg)
        unless test_r_package(cli, paths, pkg)
          raise "Unable to auto-install R package: #{pkg}"
        end
      end
    end
    cli.puts ''
  end

  ##
  # Test the Ruby gems used by MiGA and try to install the missing ones.
  # Raises a RuntimeError if a gem cannot be installed.
  def check_ruby_gems(paths)
    cli.puts 'Looking for Ruby gems:'
    %w(sqlite3 daemons json).each do |pkg|
      cli.print "Testing #{pkg}... "
      if test_ruby_gem(cli, paths, pkg)
        cli.puts 'yes.'
      else
        cli.puts 'no, installing'
        # The test and the installation are delegated to shell commands to
        # ensure that they are done on the configured Ruby, not on the Ruby
        # currently executing the init action
        cli.print install_ruby_gem(cli, paths, pkg)
        unless test_ruby_gem(cli, paths, pkg)
          raise "Unable to auto-install Ruby gem: #{pkg}"
        end
      end
    end
    cli.puts ''
  end

  ##
  # Create the daemon template `~/.miga_daemon.json`, unless one exists and
  # the user chooses to preserve it.
  def configure_daemon
    cli.puts 'Default daemon configuration:'
    daemon_f = File.expand_path('.miga_daemon.json', ENV['HOME'])
    preserve = File.exist?(daemon_f) && cli.ask_user(
      'A template daemon already exists, do you want to preserve it?',
      'yes', %w(yes no)) == 'yes'
    unless preserve
      v = {created: Time.now.to_s, updated: Time.now.to_s}
      v[:type] = cli.ask_user(
        'Please select the type of daemon you want to setup',
        cli[:dtype], %w(bash qsub msub slurm))
      case v[:type]
      when 'bash'
        configure_bash_daemon(v)
      when 'slurm'
        configure_slurm_daemon(v)
      else # [qm]sub
        configure_qsub_msub_daemon(v)
      end
      File.open(daemon_f, 'w') { |fh| fh.puts JSON.pretty_generate(v) }
    end
    cli.puts ''
  end

  ##
  # Ask for the settings of a bash daemon and store them in the Hash +v+.
  def configure_bash_daemon(v)
    ask_daemon_limits(v, '30', '6')
    cli.puts 'Setting up internal daemon defaults.'
    cli.puts 'If you don\'t understand this just leave default values:'
    v[:cmd] = cli.ask_user(
      "How should I launch tasks?\n %1$s: script path, " \
      "%2$s: variables, %3$d: CPUs, %4$s: log file, %5$s: task name.\n",
      "%2$s '%1$s' > '%4$s' 2>&1")
    ask_daemon_variables(v, ' ')
    v[:alive] = cli.ask_user(
      "How can I know that a process is still alive?\n %1$s: PID, " \
      "output should be 1 for running and 0 for non-running.\n",
      "ps -p '%1$s'|tail -n+2|wc -l")
    v[:kill] = ask_daemon_kill("kill -9 '%s'")
  end

  ##
  # Ask for the settings of a slurm daemon and store them in the Hash +v+.
  def configure_slurm_daemon(v)
    queue = cli.ask_user(
      'What queue should I use?', nil, nil, true)
    ask_daemon_limits(v, '150', '300')
    cli.puts 'Setting up internal daemon defaults'
    cli.puts 'If you don\'t understand this just leave default values:'
    # The log file placeholder is a string (%4$s), as used in the default
    v[:cmd] = cli.ask_user(
      "How should I launch tasks?\n %1$s: script path, " \
      "%2$s: variables, %3$d: CPUs, %4$s: log file, %5$s: task name.\n",
      "%2$s sbatch --partition='#{queue}' --export=ALL " \
      "--nodes=1 --ntasks-per-node=%3$d --output='%4$s' " \
      "--job-name='%5$s' --mem=9G --time=12:00:00 %1$s " \
      "| perl -pe 's/.* //'")
    ask_daemon_variables(v, ' ')
    v[:alive] = cli.ask_user(
      "How can I know that a process is still alive?\n %1$s: job id, " \
      "output should be 1 for running and 0 for non-running.\n",
      "squeue -h -o %%t -j '%1$s' | grep '^PD\\|R\\|CF\\|CG$' " \
      "| tail -n 1 | wc -l")
    v[:kill] = ask_daemon_kill("scancel '%s'")
  end

  ##
  # Ask for the settings of a qsub or msub daemon (as set in +v[:type]+) and
  # store them in the Hash +v+.
  def configure_qsub_msub_daemon(v)
    queue = cli.ask_user('What queue should I use?', nil, nil, true)
    ask_daemon_limits(v, '150', '300')
    cli.puts 'Setting up internal daemon defaults.'
    cli.puts 'If you don\'t understand this just leave default values:'
    # The log file placeholder is a string (%4$s), as used in the default
    v[:cmd] = cli.ask_user(
      "How should I launch tasks?\n %1$s: script path, " \
      "%2$s: variables, %3$d: CPUs, %4$s: log file, %5$s: task name.\n",
      "#{v[:type]} -q '#{queue}' -v '%2$s' -l nodes=1:ppn=%3$d %1$s " \
      "-j oe -o '%4$s' -N '%5$s' -l mem=9g -l walltime=12:00:00 " \
      "| grep .")
    ask_daemon_variables(v, ',')
    if v[:type] == 'qsub'
      v[:alive] = cli.ask_user(
        "How can I know that a process is still alive?\n " \
        "%1$s: job id, output should be 1 for running and " \
        "0 for non-running.\n",
        "qstat -f '%1$s'|grep ' job_state ='|perl -pe 's/.*= //'" \
        "|grep '[^C]'|tail -n1|wc -l|awk '{print $1}'")
      v[:kill] = ask_daemon_kill("qdel '%s'")
    else # msub
      v[:alive] = cli.ask_user(
        "How can I know that a process is still alive?\n " \
        "%1$s: job id, output should be 1 for running and " \
        "0 for non-running.\n",
        "checkjob '%1$s'|grep '^State:'|perl -pe 's/.*: //'" \
        "|grep 'Deferred\\|Hold\\|Idle\\|Starting\\|Running\\|Blocked'" \
        "|tail -n1|wc -l|awk '{print $1}'")
      v[:kill] = ask_daemon_kill("canceljob '%s'")
    end
  end

  ##
  # Ask for latency, maximum number of jobs and CPUs per job, offering
  # +latency+ and +maxjobs+ (Strings) as defaults. Stores the answers as
  # Integers in the Hash +v+.
  def ask_daemon_limits(v, latency, maxjobs)
    v[:latency] = cli.ask_user(
      'How long should I sleep? (in seconds)', latency).to_i
    v[:maxjobs] = cli.ask_user(
      'How many jobs can I launch at once?', maxjobs).to_i
    v[:ppn] = cli.ask_user(
      'How many CPUs can I use per job?', '2').to_i
  end

  ##
  # Ask how variables are passed to tasks, offering +varsep+ as the default
  # separator. Stores the answers in the Hash +v+.
  def ask_daemon_variables(v, varsep)
    v[:var] = cli.ask_user(
      "How should I pass variables?\n %1$s: keys, %2$s: values.\n",
      "%1$s=%2$s")
    v[:varsep] = cli.ask_user(
      'What should I use to separate variables?', varsep)
  end

  ##
  # Ask for the command used to terminate tasks, offering +default+.
  # Returns the answer.
  def ask_daemon_kill(default)
    cli.ask_user("How should I terminate tasks?\n %s: process ID.", default)
  end
end
@@ -0,0 +1,42 @@
1
+ # @package MiGA
2
+ # @license Artistic-2.0
3
+
4
+ require 'miga/cli/action'
5
+
6
##
# Action that links datasets from one project into a second (target)
# project, as hard links, symlinks or copies.
class MiGA::Cli::Action::Ln < MiGA::Cli::Action

  ##
  # Declare the defaults and the command-line options of the action.
  def parse_cli
    cli.defaults = { info: false, force: false, method: :hardlink }
    cli.parse do |opt|
      cli.opt_object(opt, [:project, :dataset_opt])
      opt.on(
        '-Q', '--project-target PATH',
        '(Mandatory) Path to the project where to link the dataset'
      ) { |path| cli[:project2] = path }
      opt.on(
        '-f', '--force',
        'Force linking, even if dataset\'s preprocessing is incomplete'
      ) { |flag| cli[:force] = flag }
      opt.on(
        '-s', '--symlink',
        'Create symlinks instead of the default hard links'
      ) { cli[:method] = :symlink }
      opt.on(
        '-c', '--copy',
        'Create copies instead of the default hard links'
      ) { cli[:method] = :copy }
      cli.opt_filter_datasets(opt)
    end
  end

  ##
  # Import every selected dataset into the target project (-Q) using the
  # requested method. Datasets with incomplete preprocessing are skipped
  # unless --force is set. The name of each imported dataset is printed.
  def perform
    # The source project is loaded first (the returned object is not needed
    # here), then the target project
    cli.load_project
    target = cli.load_project(:project2, '-Q')
    cli.load_and_filter_datasets.each do |dataset|
      if cli[:force] || dataset.done_preprocessing?
        cli.puts dataset.name
        target.import_dataset(dataset, cli[:method])
      end
    end
  end
end
@@ -0,0 +1,55 @@
1
+ # @package MiGA
2
+ # @license Artistic-2.0
3
+
4
+ require 'miga/cli/action'
5
+
6
##
# Action that lists the datasets of a project, optionally with additional
# information, processing status, or a single metadata field.
class MiGA::Cli::Action::Ls < MiGA::Cli::Action

  ##
  # Declare the defaults and the command-line options of the action.
  def parse_cli
    cli.defaults = {info: false, processing: false, silent: false}
    cli.parse do |opt|
      cli.opt_object(opt, [:project, :dataset_opt])
      cli.opt_filter_datasets(opt)
      opt.on(
        '-i', '--info',
        'Print additional information on each dataset'
      ) { |v| cli[:info] = v }
      opt.on(
        '-p', '--processing',
        'Print information on processing advance'
      ) { |v| cli[:processing] = v }
      opt.on(
        '-m', '--metadata STRING',
        'Print name and metadata field only',
        'If set, ignores -i and assumes --tab'
      ) { |v| cli[:datum] = v }
      opt.on(
        '--tab',
        'Return a tab-delimited table'
      ) { |v| cli[:tabular] = v }
      opt.on(
        '-s', '--silent',
        'No output and exit with non-zero status if the dataset list is empty'
      ) { |v| cli[:silent] = v }
    end
  end

  ##
  # List the selected datasets. The output depends on the options, in this
  # order of precedence:
  # - --silent: print nothing and exit with status 1 if the list is empty,
  #   0 otherwise.
  # - --metadata: print name and the value of the metadata field (or `?` if
  #   undefined), separated by a tab.
  # - --info: print a table with the dataset information fields.
  # - --processing: print a table with the status of each preprocessing
  #   task.
  # - Otherwise: print only the dataset names.
  def perform
    ds = cli.load_and_filter_datasets(cli[:silent])
    exit(ds.empty? ? 1 : 0) if cli[:silent]
    if !cli[:datum].nil?
      ds.each do |d|
        v = d.metadata[cli[:datum]]
        puts "#{d.name}\t#{v.nil? ? '?' : v}"
      end
    elsif cli[:info]
      # Fully qualified, like the PREPROCESSING_TASKS reference below, so
      # the constant does not depend on how this class is nested or on its
      # ancestors
      cli.table(MiGA::Dataset.INFO_FIELDS, ds.map { |d| d.info })
    elsif cli[:processing]
      # Labels indexed by the values returned by profile_advance
      comp = %w[- done queued]
      cli.table([:name] + MiGA::Dataset.PREPROCESSING_TASKS,
        ds.map { |d| [d.name] + d.profile_advance.map { |i| comp[i] } })
    else
      ds.each { |d| cli.puts d.name }
    end
  end
end
@@ -0,0 +1,218 @@
1
+ # @package MiGA
2
+ # @license Artistic-2.0
3
+
4
+ require 'miga/cli/action'
5
+ require 'miga/remote_dataset'
6
+ require 'csv'
7
+
8
##
# Action that queries the NCBI genome listing for a taxon (or for all
# reference genomes) and registers the resulting assemblies as datasets of a
# project.
class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action

  ##
  # Declare the defaults and the command-line options of the action.
  def parse_cli
    cli.defaults = {query: false, unlink: false,
      reference: false, legacy_name: false,
      complete: false, chromosome: false,
      scaffold: false, contig: false, add_version: true, dry: false,
      get_md: false, only_md: false, save_every: 1}
    cli.parse do |opt|
      cli.opt_object(opt, [:project])
      opt.on(
        '-T', '--taxon STRING',
        '(Mandatory unless --reference) Taxon name (e.g., a species binomial)'
      ) { |v| cli[:taxon] = v }
      opt.on(
        '--reference',
        'Download all reference genomes (ignore any other status)'
      ) { |v| cli[:reference] = v }
      opt.on(
        '--complete',
        'Download complete genomes'
      ) { |v| cli[:complete] = v }
      opt.on(
        '--chromosome',
        'Download complete chromosomes'
      ) { |v| cli[:chromosome] = v }
      opt.on(
        '--scaffold',
        'Download genomes in scaffolds'
      ) { |v| cli[:scaffold] = v }
      opt.on(
        '--contig',
        'Download genomes in contigs'
      ) { |v| cli[:contig] = v }
      opt.on(
        '--all',
        'Download all genomes (in any status)') do
        cli[:complete] = true
        cli[:chromosome] = true
        cli[:scaffold] = true
        cli[:contig] = true
      end
      opt.on(
        '--no-version-name',
        'Do not add sequence version to the dataset name',
        'Only affects --complete and --chromosome'
      ) { |v| cli[:add_version] = v }
      opt.on(
        '--legacy-name',
        'Use dataset names based on chromosome entries instead of assembly'
      ) { |v| cli[:legacy_name] = v }
      opt.on(
        '--blacklist PATH',
        'A file with dataset names to blacklist'
      ) { |v| cli[:blacklist] = v }
      opt.on(
        '--dry',
        'Do not download or save the datasets'
      ) { |v| cli[:dry] = v }
      opt.on(
        '--get-metadata',
        'Only download and update metadata for existing datasets'
      ) { |v| cli[:get_md] = v }
      opt.on(
        '--only-metadata',
        'Create datasets without input data but retrieve all metadata'
      ) { |v| cli[:only_md] = v }
      opt.on(
        '--save-every INT', Integer,
        'Save project every this many downloaded datasets',
        'If zero, it saves the project only once upon completion',
        "By default: #{cli[:save_every]}"
      ) { |v| cli[:save_every] = v }
      opt.on(
        '-q', '--query',
        'Register the datasets as queries, not reference datasets'
      ) { |v| cli[:query] = v }
      opt.on(
        '-u', '--unlink',
        'Unlink all datasets in the project missing from the download list'
      ) { |v| cli[:unlink] = v }
      opt.on(
        '-R', '--remote-list PATH',
        'Path to an output file with the list of all datasets listed remotely'
      ) { |v| cli[:remote_list] = v }
      opt.on(
        '--api-key STRING',
        'NCBI API key'
      ) { |v| ENV['NCBI_API_KEY'] = v }
    end
  end

  ##
  # Download the remote genome list, discard blacklisted entries, create or
  # update the datasets in the project, and report the results. Optionally
  # saves the list of remote datasets (-R) and unlinks the datasets of the
  # project missing from that list (-u).
  #
  # Raises a RuntimeError if no type of genome was requested.
  def perform
    cli.ensure_par(taxon: '-T') unless cli[:reference]
    types = %w[reference complete chromosome scaffold contig]
    unless types.any? { |i| cli[i.to_sym] }
      raise 'No action requested: pick at least one type of genome'
    end
    cli[:save_every] = 1 if cli[:dry]

    project = cli.load_project
    ds = remote_list
    discard_blacklisted(ds)
    listed, downloaded = download_entries(project, ds)

    # Finalize
    cli.say "Datasets listed: #{listed.size}"
    cli.say "Datasets #{cli[:dry] ? 'to download' : 'downloaded'}: " +
      downloaded.to_s
    unless cli[:remote_list].nil?
      File.open(cli[:remote_list], 'w') do |fh|
        listed.each { |i| fh.puts i }
      end
    end
    if cli[:unlink]
      unlink = project.dataset_names - listed
      unlink.each { |i| project.unlink_dataset(i).remove! }
      cli.say "Datasets unlinked: #{unlink.size}"
    end
  end

  private

  ##
  # Build the URL of the remote genome list for the requested taxon and
  # genome types.
  def remote_list_url
    url_base = 'https://www.ncbi.nlm.nih.gov/genomes/solr2txt.cgi?'
    # The taxon is optional with --reference, hence the +to_s+
    taxon = cli[:taxon].to_s.tr('"', "'")
    url_param = {
      q: '[display()].' +
        'from(GenomeAssemblies).' +
        'usingschema(/schema/GenomeAssemblies).' +
        'matching(tab==["Prokaryotes"] and q=="' + taxon + '"',
      fields: 'organism|organism,assembly|assembly,replicons|replicons,' +
        'level|level,ftp_path_genbank|ftp_path_genbank,release_date|release_date,' +
        'strain|strain',
      nolimit: 'on',
    }
    if cli[:reference]
      url_param[:q] += ' and refseq_category==["representative"]'
    else
      status = {
        complete: 'Complete',
        chromosome: ' Chromosome', # <- The leading space is *VERY* important!
        scaffold: 'Scaffold',
        contig: 'Contig'
      }.map { |k, v| '"' + v + '"' if cli[k] }.compact.join(',')
      url_param[:q] += ' and level==[' + status + ']'
    end
    url_param[:q] += ')'
    url_base + URI.encode_www_form(url_param)
  end

  ##
  # Download and parse the remote genome list.
  #
  # Returns a Hash mapping dataset names to a Hash with the keys +:ids+,
  # +:db+, +:universe+ and +:md+ (metadata), as used by #download_entries.
  def remote_list
    cli.say 'Downloading genome list'
    doc = RemoteDataset.download_url(remote_list_url)
    ds = {}
    CSV.parse(doc, headers: true).each do |r|
      asm = r['assembly']
      next if asm.nil? or asm.empty? or asm == '-'
      next unless r['ftp_path_genbank']

      # Get replicons
      rep = r['replicons'].nil? ? nil : r['replicons'].
        split('; ').map { |i| i.gsub(/.*:/, '') }.map { |i| i.gsub(/\/.*/, '') }

      # Set name
      if cli[:legacy_name] and cli[:reference]
        n = r['#organism'].miga_name
      else
        if cli[:legacy_name] and ['Complete', ' Chromosome'].include? r['level']
          acc = rep.nil? ? '' : rep.first
        else
          acc = asm
        end
        # Strip the version on a copy: the accessions themselves are stored
        # as metadata below and must remain intact
        acc = acc.to_s.sub(/\.\d+\Z/, '') unless cli[:add_version]
        n = "#{r['#organism']}_#{acc}".miga_name
      end

      # Register for download
      fna_url = r['ftp_path_genbank'] + '/' +
        File.basename(r['ftp_path_genbank']) + '_genomic.fna.gz'
      ds[n] = {
        ids: [fna_url], db: :assembly_gz, universe: :web,
        md: {
          type: :genome, ncbi_asm: asm, strain: r['strain']
        }
      }
      ds[n][:md][:ncbi_nuccore] = rep.join(',') unless rep.nil?
      ds[n][:md][:release_date] =
        Time.parse(r['release_date']).to_s unless r['release_date'].nil?
    end
    ds
  end

  ##
  # Remove from the Hash +ds+ all the dataset names listed in the blacklist
  # file (if any). Lines starting with `#` are ignored.
  def discard_blacklisted(ds)
    return if cli[:blacklist].nil?

    cli.say "Discarding datasets in #{cli[:blacklist]}"
    File.readlines(cli[:blacklist]).
      select { |i| i !~ /^#/ }.map(&:chomp).each { |i| ds.delete i }
  end

  ##
  # Create (or update, with --get-metadata) in +project+ the datasets
  # registered in the Hash +ds+, saving the project as indicated by
  # --save-every.
  #
  # Returns an Array with two elements: the Array of all listed dataset
  # names, and the number of datasets downloaded (or to download, with
  # --dry).
  def download_entries(project, ds)
    listed = []
    downloaded = 0
    cli.say "Downloading #{ds.size} " + (ds.size == 1 ? 'entry' : 'entries')
    project.do_not_save = true if cli[:save_every] != 1
    ds.each do |name, body|
      listed << name
      cli.puts name
      # Without --get-metadata only new datasets are processed; with it,
      # only the existing ones
      next if project.dataset(name).nil? == cli[:get_md]

      downloaded += 1
      next if cli[:dry]

      cli.say ' Locating remote dataset'
      body[:md][:metadata_only] = true if cli[:only_md]
      rd = RemoteDataset.new(body[:ids], body[:db], body[:universe])
      if cli[:get_md]
        cli.say ' Updating dataset'
        rd.update_metadata(project.dataset(name), body[:md])
      else
        cli.say ' Creating dataset'
        rd.save_to(project, name, !cli[:query], body[:md])
        project.add_dataset(name)
      end
      if cli[:save_every] > 1 and (downloaded % cli[:save_every]) == 0
        project.save!
      end
    end

    project.do_not_save = false
    project.save! if cli[:save_every] != 1
    [listed, downloaded]
  end
end