miga-base 0.4.1.0 → 0.4.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (67) hide show
  1. checksums.yaml +4 -4
  2. data/bin/miga +2 -244
  3. data/lib/miga/cli/action/about.rb +44 -0
  4. data/lib/miga/cli/action/add.rb +139 -0
  5. data/lib/miga/cli/action/add_result.rb +26 -0
  6. data/lib/miga/cli/action/console.rb +19 -0
  7. data/lib/miga/cli/action/daemon.rb +74 -0
  8. data/lib/miga/cli/action/date.rb +18 -0
  9. data/lib/miga/cli/action/doctor.rb +210 -0
  10. data/lib/miga/cli/action/edit.rb +24 -0
  11. data/lib/miga/cli/action/files.rb +31 -0
  12. data/lib/miga/cli/action/find.rb +48 -0
  13. data/lib/miga/cli/action/generic.rb +44 -0
  14. data/lib/miga/cli/action/get.rb +132 -0
  15. data/lib/miga/cli/action/init.rb +343 -0
  16. data/lib/miga/cli/action/ln.rb +42 -0
  17. data/lib/miga/cli/action/ls.rb +55 -0
  18. data/lib/miga/cli/action/ncbi_get.rb +218 -0
  19. data/lib/miga/cli/action/new.rb +45 -0
  20. data/lib/miga/cli/action/next_step.rb +27 -0
  21. data/lib/miga/cli/action/plugins.rb +28 -0
  22. data/lib/miga/cli/action/rm.rb +25 -0
  23. data/lib/miga/cli/action/run.rb +39 -0
  24. data/lib/miga/cli/action/stats.rb +140 -0
  25. data/lib/miga/cli/action/summary.rb +49 -0
  26. data/lib/miga/cli/action/tax_dist.rb +102 -0
  27. data/lib/miga/cli/action/tax_index.rb +47 -0
  28. data/lib/miga/cli/action/tax_set.rb +59 -0
  29. data/lib/miga/cli/action/tax_test.rb +77 -0
  30. data/lib/miga/cli/action.rb +66 -0
  31. data/lib/miga/cli/base.rb +90 -0
  32. data/lib/miga/cli.rb +426 -0
  33. data/lib/miga/project/result.rb +14 -6
  34. data/lib/miga/remote_dataset.rb +1 -1
  35. data/lib/miga/tax_index.rb +5 -4
  36. data/lib/miga/taxonomy/base.rb +63 -0
  37. data/lib/miga/taxonomy.rb +87 -92
  38. data/lib/miga/version.rb +6 -6
  39. data/test/taxonomy_test.rb +49 -9
  40. data/utils/distance/commands.rb +11 -11
  41. data/utils/distance/pipeline.rb +5 -5
  42. metadata +43 -49
  43. data/actions/about.rb +0 -43
  44. data/actions/add.rb +0 -129
  45. data/actions/add_result.rb +0 -30
  46. data/actions/daemon.rb +0 -55
  47. data/actions/date.rb +0 -14
  48. data/actions/doctor.rb +0 -201
  49. data/actions/edit.rb +0 -33
  50. data/actions/files.rb +0 -43
  51. data/actions/find.rb +0 -41
  52. data/actions/get.rb +0 -105
  53. data/actions/init.rb +0 -301
  54. data/actions/ln.rb +0 -47
  55. data/actions/ls.rb +0 -61
  56. data/actions/ncbi_get.rb +0 -192
  57. data/actions/new.rb +0 -44
  58. data/actions/next_step.rb +0 -33
  59. data/actions/plugins.rb +0 -25
  60. data/actions/rm.rb +0 -29
  61. data/actions/run.rb +0 -45
  62. data/actions/stats.rb +0 -149
  63. data/actions/summary.rb +0 -57
  64. data/actions/tax_dist.rb +0 -106
  65. data/actions/tax_index.rb +0 -46
  66. data/actions/tax_set.rb +0 -63
  67. data/actions/tax_test.rb +0 -80
@@ -0,0 +1,343 @@
1
+ # @package MiGA
2
+ # @license Artistic-2.0
3
+
4
+ require 'miga/cli/action'
5
+ require 'shellwords'
6
+
7
+ class MiGA::Cli::Action::Init < MiGA::Cli::Action
8
+
9
# Declares the defaults and command-line options of the init action.
#
# Marks the CLI as interactive, sets the default values, and registers
# every option on the parser yielded by +cli.parse+. Each handler stores
# the parsed value in +cli+ under +:config+, +:mytaxa+, +:dtype+ or +:ask+.
def parse_cli
  cli.interactive = true
  cli.defaults = {
    mytaxa: nil,
    config: File.expand_path('.miga_modules', ENV['HOME']),
    ask: false, auto: false, dtype: :bash
  }
  cli.parse do |opt|
    opt.on(
      '-c', '--config PATH',
      'Path to the Bash configuration file',
      "By default: #{cli[:config]}"
    ) { |v| cli[:config] = v }
    opt.on(
      '--[no-]mytaxa',
      'Should I try setting up MyTaxa and its dependencies?',
      'By default: interactive (true if --auto)'
    ) { |v| cli[:mytaxa] = v }
    opt.on(
      '--daemon-type STRING',
      'Type of daemon launcher, one of: bash, qsub, msub, slurm',
      "By default: interactive (#{cli[:dtype]} if --auto)"
    ) { |v| cli[:dtype] = v.to_sym }
    opt.on(
      '--ask-all',
      'Ask for the location of all software',
      'By default, only the locations missing in PATH are requested'
    ) { |v| cli[:ask] = v }
  end
end
37
+
38
# Runs the MiGA initialization dialog.
#
# In order: prints a banner (optionally followed by the contents of
# +utils/requirements.txt+), (re)writes +~/.miga_rc+, locates the required
# software, checks the MyTaxa databases (only if MyTaxa was requested),
# the R packages and the Ruby gems, and finally writes the default daemon
# configuration to +~/.miga_daemon.json+.
#
# Terminates the process with status 0 if the user declines to continue
# over a previous configuration, and with status 1 if a MyTaxa database is
# missing. Raises a RuntimeError if an R package or a Ruby gem cannot be
# installed automatically.
def perform
  miga = MiGA.root_path
  requirements = File.expand_path('utils/requirements.txt', miga)
  cli.puts <<BANNER
===[ Welcome to MiGA, the Microbial Genome Atlas ]===

I'm the initialization script, and I'll sniff around your computer to
make sure you have all the requirements for MiGA data processing.

BANNER

  if cli.ask_user(
    'Would you like to see all the requirements before starting?',
    'no', %w(yes no)
  ) == 'yes'
    cli.puts ''
    File.open(requirements, 'r') do |fh|
      fh.each_line { |ln| cli.puts ln }
    end
    cli.puts ''
  end

  rc_path = File.expand_path('.miga_rc', ENV['HOME'])
  if File.exist?(rc_path) && cli.ask_user(
    'I found a previous configuration. Do you want to continue?',
    'yes', %w(yes no)
  ) == 'no'
    cli.puts 'OK, see you soon!'
    exit(0)
  end

  # The block form guarantees the rc file is flushed and closed, even when
  # one of the checks below raises or calls +exit+.
  File.open(rc_path, 'w') do |rc_fh|
    rc_fh.puts <<BASH
#!/bin/bash
# `miga init` made this on #{Time.now}

BASH
    check_configuration_script(rc_fh)
    paths = check_software_requirements(rc_fh, requirements)
    check_mytaxa_databases(paths) if cli[:mytaxa]
    check_r_packages(paths)
    check_ruby_gems(paths)
    configure_daemon
    rc_fh.puts <<FOOT

MIGA_CONFIG_VERSION='#{MiGA::MiGA.VERSION}'
MIGA_CONFIG_LONGVERSION='#{MiGA::MiGA.LONG_VERSION}'
MIGA_CONFIG_DATE='#{Time.now}'

FOOT
  end

  cli.puts 'Configuration complete. MiGA is ready to work!'
  cli.puts ''
end

# Resolves the startup script in +cli[:config]+ and registers it in +rc_fh+.
# Falls back to +/dev/null+ when no such file exists.
def check_configuration_script(rc_fh)
  unless File.exist? cli[:config]
    cli[:config] = cli.ask_user(
      'Is there a script I need to load at startup?', cli[:config]
    )
  end
  if File.exist? cli[:config]
    cli[:config] = File.expand_path(cli[:config])
    cli.puts "Found bash configuration script: #{cli[:config]}"
    rc_fh.puts "MIGA_STARTUP='#{cli[:config]}'"
    rc_fh.puts '. "$MIGA_STARTUP"'
  else
    cli[:config] = '/dev/null'
  end
  cli.puts ''
end

# Locates every program listed in the +requirements+ file, writes the
# extra PATH entries to +rc_fh+, and returns a Hash mapping each program
# name to its absolute (unescaped) path.
def check_software_requirements(rc_fh, requirements)
  cli.puts 'Looking for requirements:'
  if cli[:mytaxa].nil?
    cli[:mytaxa] = cli.ask_user(
      'Should I include MyTaxa modules?', 'yes', %w(yes no)
    ) == 'yes'
  end
  rc_fh.puts 'export MIGA_MYTAXA="no"' unless cli[:mytaxa]
  paths = {}
  rc_fh.puts 'MIGA_PATH=""'
  File.open(requirements, 'r') do |fh|
    fh.each_line do |ln|
      # The first two lines of the file are skipped (presumably headers)
      next if fh.lineno < 3
      r = ln.chomp.split(/\t+/)
      next if r[0] =~ /\(opt\)$/ && !cli[:mytaxa]
      cli.print "Testing #{r[0]}#{" (#{r[3]})" if r[3]}... "
      path = nil
      # Keep asking until an executable is found at the given location
      loop do
        # +which+ prints nothing when the program is missing, in which case
        # +File.dirname+ returns '.'
        d_path = File.dirname(run_cmd(cli, "which #{r[1].shellescape}"))
        if cli[:ask] || d_path == '.'
          path = cli.ask_user('Where can I find it?', d_path, nil, true)
        else
          path = d_path
          cli.puts path
        end
        if File.executable?(File.expand_path(r[1], path))
          if d_path != path
            rc_fh.puts "MIGA_PATH=\"#{path}:$MIGA_PATH\" # #{r[1]}"
          end
          break
        end
        cli.print "I cannot find #{r[1]} "
      end
      # Stored unescaped: the path is used both as a filesystem path
      # (File.dirname) and in shell commands, where the command builders
      # escape it themselves
      paths[r[1]] = File.expand_path(r[1], path)
    end
  end
  rc_fh.puts 'export PATH="$MIGA_PATH$PATH"'
  cli.puts ''
  paths
end

# Checks that the MyTaxa databases sit next to the MyTaxa executable and
# exits with status 1 (after printing instructions) if one is missing.
def check_mytaxa_databases(paths)
  cli.puts 'Looking for MyTaxa databases:'
  mt = File.dirname paths['MyTaxa']
  cli.print 'Looking for scores... '
  unless Dir.exist?(File.expand_path('db', mt))
    cli.puts "no.\nExecute 'python2 #{mt}/utils/download_db.py'."
    exit(1)
  end
  cli.puts 'yes.'
  cli.print 'Looking for diamond db... '
  unless File.exist?(File.expand_path('AllGenomes.faa.dmnd', mt))
    cli.puts "no.\nDownload " \
      "'http://enve-omics.ce.gatech.edu/data/public_mytaxa/" \
      "AllGenomes.faa.dmnd' into #{mt}."
    exit(1)
  end
  cli.puts 'yes.'
  cli.puts ''
end

# Tests each required R package and tries to install the missing ones.
def check_r_packages(paths)
  cli.puts 'Looking for R packages:'
  %w(enveomics.R ape cluster vegan).each do |pkg|
    cli.print "Testing #{pkg}... "
    if test_r_package(cli, paths, pkg)
      cli.puts 'yes.'
    else
      cli.puts 'no, installing'
      cli.print install_r_package(cli, paths, pkg)
      unless test_r_package(cli, paths, pkg)
        raise "Unable to auto-install R package: #{pkg}"
      end
    end
  end
  cli.puts ''
end

# Tests each required Ruby gem and tries to install the missing ones.
def check_ruby_gems(paths)
  cli.puts 'Looking for Ruby gems:'
  %w(sqlite3 daemons json).each do |pkg|
    cli.print "Testing #{pkg}... "
    if test_ruby_gem(cli, paths, pkg)
      cli.puts 'yes.'
    else
      cli.puts 'no, installing'
      # This hackey mess is meant to ensure the test and installation are
      # done on the configuration Ruby, not on the Ruby currently executing
      # the init action
      cli.print install_ruby_gem(cli, paths, pkg)
      unless test_ruby_gem(cli, paths, pkg)
        raise "Unable to auto-install Ruby gem: #{pkg}"
      end
    end
  end
  cli.puts ''
end

# Asks for the daemon settings and saves them as JSON in
# +~/.miga_daemon.json+, unless the user chooses to keep an existing file.
def configure_daemon
  cli.puts 'Default daemon configuration:'
  daemon_f = File.expand_path('.miga_daemon.json', ENV['HOME'])
  keep = File.exist?(daemon_f) && cli.ask_user(
    'A template daemon already exists, do you want to preserve it?',
    'yes', %w(yes no)
  ) == 'yes'
  unless keep
    v = { created: Time.now.to_s, updated: Time.now.to_s }
    v[:type] = cli.ask_user(
      'Please select the type of daemon you want to setup',
      cli[:dtype], %w(bash qsub msub slurm)
    )
    case v[:type]
    when 'bash'
      configure_bash_daemon(v)
    when 'slurm'
      configure_slurm_daemon(v)
    else # [qm]sub
      configure_qsub_msub_daemon(v)
    end
    File.open(daemon_f, 'w') { |fh| fh.puts JSON.pretty_generate(v) }
  end
  cli.puts ''
end

# Asks for the polling latency, job limit and CPUs per job, storing them
# in +v+ as integers. +latency+ and +maxjobs+ are the suggested defaults.
def ask_daemon_limits(v, latency, maxjobs)
  v[:latency] = cli.ask_user(
    'How long should I sleep? (in seconds)', latency
  ).to_i
  v[:maxjobs] = cli.ask_user(
    'How many jobs can I launch at once?', maxjobs
  ).to_i
  v[:ppn] = cli.ask_user('How many CPUs can I use per job?', '2').to_i
end

# Asks how variables are formatted and separated, storing both in +v+.
# +separator+ is the suggested default separator.
def ask_daemon_variables(v, separator)
  v[:var] = cli.ask_user(
    "How should I pass variables?\n %1$s: keys, %2$s: values.\n",
    "%1$s=%2$s"
  )
  v[:varsep] = cli.ask_user(
    'What should I use to separate variables?', separator
  )
end

# Fills +v+ with the settings of a daemon launching tasks through bash.
def configure_bash_daemon(v)
  ask_daemon_limits(v, '30', '6')
  cli.puts 'Setting up internal daemon defaults.'
  cli.puts 'If you don\'t understand this just leave default values:'
  v[:cmd] = cli.ask_user(
    "How should I launch tasks?\n %1$s: script path, " \
    "%2$s: variables, %3$d: CPUs, %4$s: log file, %5$s: task name.\n",
    "%2$s '%1$s' > '%4$s' 2>&1"
  )
  ask_daemon_variables(v, ' ')
  v[:alive] = cli.ask_user(
    "How can I know that a process is still alive?\n %1$s: PID, " \
    "output should be 1 for running and 0 for non-running.\n",
    "ps -p '%1$s'|tail -n+2|wc -l"
  )
  v[:kill] = cli.ask_user(
    "How should I terminate tasks?\n %s: process ID.", "kill -9 '%s'"
  )
end

# Fills +v+ with the settings of a daemon launching tasks through slurm.
def configure_slurm_daemon(v)
  queue = cli.ask_user('What queue should I use?', nil, nil, true)
  ask_daemon_limits(v, '150', '300')
  cli.puts 'Setting up internal daemon defaults'
  cli.puts 'If you don\'t understand this just leave default values:'
  v[:cmd] = cli.ask_user(
    "How should I launch tasks?\n %1$s: script path, " \
    "%2$s: variables, %3$d: CPUs, %4$s: log file, %5$s: task name.\n",
    "%2$s sbatch --partition='#{queue}' --export=ALL " \
    "--nodes=1 --ntasks-per-node=%3$d --output='%4$s' " \
    "--job-name='%5$s' --mem=9G --time=12:00:00 %1$s " \
    "| perl -pe 's/.* //'"
  )
  ask_daemon_variables(v, ' ')
  v[:alive] = cli.ask_user(
    "How can I know that a process is still alive?\n %1$s: job id, " \
    "output should be 1 for running and 0 for non-running.\n",
    "squeue -h -o %%t -j '%1$s' | grep '^PD\\|R\\|CF\\|CG$' " \
    "| tail -n 1 | wc -l"
  )
  v[:kill] = cli.ask_user(
    "How should I terminate tasks?\n %s: process ID.", "scancel '%s'"
  )
end

# Fills +v+ with the settings of a daemon launching tasks through qsub or
# msub, as selected in +v[:type]+.
def configure_qsub_msub_daemon(v)
  queue = cli.ask_user('What queue should I use?', nil, nil, true)
  ask_daemon_limits(v, '150', '300')
  cli.puts 'Setting up internal daemon defaults.'
  cli.puts 'If you don\'t understand this just leave default values:'
  v[:cmd] = cli.ask_user(
    "How should I launch tasks?\n %1$s: script path, " \
    "%2$s: variables, %3$d: CPUs, %4$s: log file, %5$s: task name.\n",
    "#{v[:type]} -q '#{queue}' -v '%2$s' -l nodes=1:ppn=%3$d %1$s " \
    "-j oe -o '%4$s' -N '%5$s' -l mem=9g -l walltime=12:00:00 " \
    "| grep ."
  )
  ask_daemon_variables(v, ',')
  if v[:type] == 'qsub'
    v[:alive] = cli.ask_user(
      "How can I know that a process is still alive?\n " \
      "%1$s: job id, output should be 1 for running and " \
      "0 for non-running.\n",
      "qstat -f '%1$s'|grep ' job_state ='|perl -pe 's/.*= //'" \
      "|grep '[^C]'|tail -n1|wc -l|awk '{print $1}'"
    )
    v[:kill] = cli.ask_user(
      "How should I terminate tasks?\n %s: process ID.", "qdel '%s'"
    )
  else # msub
    v[:alive] = cli.ask_user(
      "How can I know that a process is still alive?\n " \
      "%1$s: job id, output should be 1 for running and " \
      "0 for non-running.\n",
      "checkjob '%1$s'|grep '^State:'|perl -pe 's/.*: //'" \
      "|grep 'Deferred\\|Hold\\|Idle\\|Starting\\|Running\\|Blocked'" \
      "|tail -n1|wc -l|awk '{print $1}'"
    )
    v[:kill] = cli.ask_user(
      "How should I terminate tasks?\n %s: process ID.",
      "canceljob '%s'"
    )
  end
end

private :check_configuration_script, :check_software_requirements,
        :check_mytaxa_databases, :check_r_packages, :check_ruby_gems,
        :configure_daemon, :ask_daemon_limits, :ask_daemon_variables,
        :configure_bash_daemon, :configure_slurm_daemon,
        :configure_qsub_msub_daemon
308
+
309
# Deliberately does nothing and returns +nil+.
# NOTE(review): presumably overrides a hook of MiGA::Cli::Action so that
# running the action without arguments is allowed -- confirm in the parent.
def empty_action
  nil
end
311
+
312
# Runs +cmd+ in a shell right after sourcing the configured startup script.
#
# @param cli [#[]] object whose +:config+ entry is the path of the script
#   to source before running the command
# @param cmd [String] shell command; it is passed to the shell verbatim, so
#   the caller is responsible for escaping its arguments
# @return [String] the standard output of the command
def run_cmd(cli, cmd)
  # Escape the path instead of wrapping it in double quotes, so that
  # characters such as $, ` or " in the path are taken literally
  `. #{cli[:config].to_s.shellescape} && #{cmd}`
end
315
+
316
# Pipes the R code in +cmd+ into the R executable registered in
# +paths['R']+ (run with +--vanilla -q+, stderr merged into stdout) and
# returns whatever +run_cmd+ returns for that command line.
def run_r_cmd(cli, paths, cmd)
  r_code = cmd.shellescape
  r_bin = paths['R'].shellescape
  run_cmd(cli, "echo #{r_code} | #{r_bin} --vanilla -q 2>&1")
end
320
+
321
# Tries to load the R package +pkg+ with +library()+ and reports whether
# the last child process finished successfully.
def test_r_package(cli, paths, pkg)
  r_code = "library('#{pkg}')"
  run_r_cmd(cli, paths, r_code)
  $?.success?
end
325
+
326
# Asks R to install the package +pkg+ from the CRAN mirror at
# cran.rstudio.com and returns the result of +run_r_cmd+.
def install_r_package(cli, paths, pkg)
  run_r_cmd(
    cli, paths,
    "install.packages('#{pkg}', repos='http://cran.rstudio.com/')"
  )
end
330
+
331
# Checks that the Ruby executable registered in +paths['ruby']+ can
# require +pkg+: runs an empty script with +-r pkg+ (stderr discarded) and
# reports whether the last child process finished successfully.
def test_ruby_gem(cli, paths, pkg)
  ruby_bin = paths['ruby'].shellescape
  run_cmd(cli, "#{ruby_bin} -r #{pkg.shellescape} -e '' 2>/dev/null")
  $?.success?
end
336
+
337
# Installs the gem +pkg+ (with +--user+) using the Ruby executable
# registered in +paths['ruby']+, by running Gem::GemRunner in that
# interpreter. Returns the result of +run_cmd+ for the command, with
# stderr merged into stdout.
def install_ruby_gem(cli, paths, pkg)
  gem_cmd = "Gem::GemRunner.new.run %w(install --user #{pkg})"
  # Built from explicit tokens so that no source indentation leaks into
  # the command line
  run_cmd(
    cli,
    [
      paths['ruby'].shellescape,
      '-r rubygems -r rubygems/gem_runner',
      "-e #{gem_cmd.shellescape}",
      '2>&1'
    ].join(' ')
  )
end
343
+ end
@@ -0,0 +1,42 @@
1
+ # @package MiGA
2
+ # @license Artistic-2.0
3
+
4
+ require 'miga/cli/action'
5
+
6
+ class MiGA::Cli::Action::Ln < MiGA::Cli::Action
7
+
8
# Declares the defaults and command-line options of the ln action.
#
# Besides the options added by +cli.opt_object+ and
# +cli.opt_filter_datasets+, it registers the target project (+-Q+), the
# +--force+ flag, and the linking method (+:hardlink+ by default,
# +:symlink+ with +-s+, +:copy+ with +-c+).
def parse_cli
  cli.defaults = { info: false, force: false, method: :hardlink }
  cli.parse do |opt|
    cli.opt_object(opt, [:project, :dataset_opt])
    opt.on(
      '-Q', '--project-target PATH',
      '(Mandatory) Path to the project where to link the dataset'
    ) do |path|
      cli[:project2] = path
    end
    opt.on(
      '-f', '--force',
      "Force linking, even if dataset's preprocessing is incomplete"
    ) do |flag|
      cli[:force] = flag
    end
    opt.on(
      '-s', '--symlink',
      'Create symlinks instead of the default hard links'
    ) do
      cli[:method] = :symlink
    end
    opt.on(
      '-c', '--copy',
      'Create copies instead of the default hard links'
    ) do
      cli[:method] = :copy
    end
    cli.opt_filter_datasets(opt)
  end
end
31
+
32
# Imports the selected datasets into the target project.
#
# Loads the source project and the project given with +-Q+, then, for every
# dataset returned by +cli.load_and_filter_datasets+, prints its name and
# imports it into the target project with the configured method. Datasets
# with incomplete preprocessing are skipped unless +cli[:force]+ is set.
def perform
  # The result is not needed here, but the call is kept as in the original
  # (NOTE(review): presumably it validates the source project -- confirm)
  cli.load_project
  target = cli.load_project(:project2, '-Q')
  cli.load_and_filter_datasets.each do |dataset|
    next unless cli[:force] || dataset.done_preprocessing?

    cli.puts dataset.name
    target.import_dataset(dataset, cli[:method])
  end
end
42
+ end
@@ -0,0 +1,55 @@
1
+ # @package MiGA
2
+ # @license Artistic-2.0
3
+
4
+ require 'miga/cli/action'
5
+
6
+ class MiGA::Cli::Action::Ls < MiGA::Cli::Action
7
+
8
# Declares the defaults and command-line options of the ls action.
#
# After the options added by +cli.opt_object+ and
# +cli.opt_filter_datasets+, every option registered here simply stores
# its value in +cli+ under the key given as first argument to +store+.
def parse_cli
  cli.defaults = { info: false, processing: false, silent: false }
  cli.parse do |opt|
    # Registers an option whose handler saves the parsed value in cli[key]
    store = lambda { |key, *spec| opt.on(*spec) { |v| cli[key] = v } }

    cli.opt_object(opt, [:project, :dataset_opt])
    cli.opt_filter_datasets(opt)
    store.call(
      :info, '-i', '--info',
      'Print additional information on each dataset'
    )
    store.call(
      :processing, '-p', '--processing',
      'Print information on processing advance'
    )
    store.call(
      :datum, '-m', '--metadata STRING',
      'Print name and metadata field only',
      'If set, ignores -i and assumes --tab'
    )
    store.call(
      :tabular, '--tab',
      'Return a tab-delimited table'
    )
    store.call(
      :silent, '-s', '--silent',
      'No output and exit with non-zero status if the dataset list is empty'
    )
  end
end
36
+
37
# Lists the selected datasets in one of four formats.
#
# * With +cli[:silent]+: prints nothing and exits with status 1 if the
#   list is empty, 0 otherwise.
# * With +cli[:datum]+: one line per dataset with its name and the value of
#   that metadata field (or +?+ if unset), separated by a tab.
# * With +cli[:info]+: a table of +d.info+ for each dataset.
# * With +cli[:processing]+: a table with the advance of each
#   preprocessing task.
# * Otherwise: one dataset name per line.
def perform
  ds = cli.load_and_filter_datasets(cli[:silent])
  exit(ds.empty? ? 1 : 0) if cli[:silent]
  if !cli[:datum].nil?
    ds.each do |d|
      v = d.metadata[cli[:datum]]
      cli.puts "#{d.name}\t#{v.nil? ? '?' : v}"
    end
  elsif cli[:info]
    # Fully qualified: a bare +Dataset+ cannot be resolved lexically from a
    # class opened as +MiGA::Cli::Action::Ls+
    cli.table(MiGA::Dataset.INFO_FIELDS, ds.map(&:info))
  elsif cli[:processing]
    # Labels indexed by the codes returned by +profile_advance+
    comp = %w[- done queued]
    cli.table(
      [:name] + MiGA::Dataset.PREPROCESSING_TASKS,
      ds.map { |d| [d.name] + d.profile_advance.map { |i| comp[i] } }
    )
  else
    ds.each { |d| cli.puts d.name }
  end
end
55
+ end
@@ -0,0 +1,218 @@
1
+ # @package MiGA
2
+ # @license Artistic-2.0
3
+
4
+ require 'miga/cli/action'
5
+ require 'miga/remote_dataset'
6
+ require 'csv'
7
+
8
+ class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action
9
+
10
# Declares the defaults and command-line options of the ncbi_get action.
#
# Most options store their value in +cli+ under the key given as first
# argument to +store+. The exceptions are +--all+, which turns on the four
# assembly levels at once, and +--api-key+, which is saved in the
# +NCBI_API_KEY+ environment variable instead of +cli+.
def parse_cli
  cli.defaults = {
    query: false, unlink: false,
    reference: false, legacy_name: false,
    complete: false, chromosome: false,
    scaffold: false, contig: false, add_version: true, dry: false,
    get_md: false, only_md: false, save_every: 1
  }
  cli.parse do |opt|
    # Registers an option whose handler saves the parsed value in cli[key]
    store = lambda { |key, *spec| opt.on(*spec) { |v| cli[key] = v } }

    cli.opt_object(opt, [:project])
    store.call(
      :taxon, '-T', '--taxon STRING',
      '(Mandatory unless --reference) Taxon name (e.g., a species binomial)'
    )
    store.call(
      :reference, '--reference',
      'Download all reference genomes (ignore any other status)'
    )
    store.call(:complete, '--complete', 'Download complete genomes')
    store.call(:chromosome, '--chromosome', 'Download complete chromosomes')
    store.call(:scaffold, '--scaffold', 'Download genomes in scaffolds')
    store.call(:contig, '--contig', 'Download genomes in contigs')
    opt.on('--all', 'Download all genomes (in any status)') do
      %i[complete chromosome scaffold contig].each { |k| cli[k] = true }
    end
    store.call(
      :add_version, '--no-version-name',
      'Do not add sequence version to the dataset name',
      'Only affects --complete and --chromosome'
    )
    store.call(
      :legacy_name, '--legacy-name',
      'Use dataset names based on chromosome entries instead of assembly'
    )
    store.call(
      :blacklist, '--blacklist PATH',
      'A file with dataset names to blacklist'
    )
    store.call(:dry, '--dry', 'Do not download or save the datasets')
    store.call(
      :get_md, '--get-metadata',
      'Only download and update metadata for existing datasets'
    )
    store.call(
      :only_md, '--only-metadata',
      'Create datasets without input data but retrieve all metadata'
    )
    store.call(
      :save_every, '--save-every INT', Integer,
      'Save project every this many downloaded datasets',
      'If zero, it saves the project only once upon completion',
      "By default: #{cli[:save_every]}"
    )
    store.call(
      :query, '-q', '--query',
      'Register the datasets as queries, not reference datasets'
    )
    store.call(
      :unlink, '-u', '--unlink',
      'Unlink all datasets in the project missing from the download list'
    )
    store.call(
      :remote_list, '-R', '--remote-list PATH',
      'Path to an output file with the list of all datasets listed remotely'
    )
    opt.on('--api-key STRING', 'NCBI API key') do |key|
      ENV['NCBI_API_KEY'] = key
    end
  end
end
94
+
95
# Downloads the list of genome assemblies matching the request from NCBI
# and registers them as datasets of the project.
#
# Requires +-T+ unless +--reference+ is set, and at least one type of
# genome (raises a RuntimeError otherwise). For every remote entry not in
# the blacklist it prints the name and, depending on the options, creates
# the dataset, updates the metadata of an existing one (+--get-metadata+),
# or only counts it (+--dry+). Optionally writes the names of all listed
# datasets to +cli[:remote_list]+ and unlinks the project datasets that
# are not in that list (+--unlink+).
def perform
  cli.ensure_par(taxon: '-T') unless cli[:reference]
  unless %i[reference complete chromosome scaffold contig].any? { |i| cli[i] }
    raise 'No action requested: pick at least one type of genome'
  end
  cli[:save_every] = 1 if cli[:dry]

  p = cli.load_project
  listed = []
  downloaded = 0

  url = ncbi_list_url
  cli.say 'Downloading genome list'
  ds = parse_ncbi_list(MiGA::RemoteDataset.download_url(url))

  # Discard blacklisted
  unless cli[:blacklist].nil?
    cli.say "Discarding datasets in #{cli[:blacklist]}"
    File.readlines(cli[:blacklist])
        .select { |i| i !~ /^#/ }.map(&:chomp).each { |i| ds.delete i }
  end

  # Download entries
  cli.say "Downloading #{ds.size} " + (ds.size == 1 ? 'entry' : 'entries')
  p.do_not_save = true if cli[:save_every] != 1
  ds.each do |name, body|
    listed << name
    cli.puts name
    # Without --get-metadata only new datasets are processed; with it,
    # only datasets already in the project
    next if p.dataset(name).nil? == cli[:get_md]

    downloaded += 1
    next if cli[:dry]

    cli.say ' Locating remote dataset'
    body[:md][:metadata_only] = true if cli[:only_md]
    rd = MiGA::RemoteDataset.new(body[:ids], body[:db], body[:universe])
    if cli[:get_md]
      cli.say ' Updating dataset'
      rd.update_metadata(p.dataset(name), body[:md])
    else
      cli.say ' Creating dataset'
      rd.save_to(p, name, !cli[:query], body[:md])
      p.add_dataset(name)
    end
    if cli[:save_every] > 1 && (downloaded % cli[:save_every]).zero?
      p.save!
    end
  end

  p.do_not_save = false
  p.save! if cli[:save_every] != 1

  # Finalize
  cli.say "Datasets listed: #{listed.size}"
  cli.say "Datasets #{cli[:dry] ? 'to download' : 'downloaded'}: " +
          downloaded.to_s
  unless cli[:remote_list].nil?
    File.open(cli[:remote_list], 'w') do |fh|
      listed.each { |i| fh.puts i }
    end
  end
  if cli[:unlink]
    unlink = p.dataset_names - listed
    unlink.each { |i| p.unlink_dataset(i).remove! }
    cli.say "Datasets unlinked: #{unlink.size}"
  end
end

# Builds the URL of the NCBI genome listing for the requested taxon and
# genome types.
def ncbi_list_url
  url_base = 'https://www.ncbi.nlm.nih.gov/genomes/solr2txt.cgi?'
  # The taxon is optional with --reference, hence the +to_s+
  taxon = cli[:taxon].to_s.tr('"', "'")
  query = '[display()].' \
          'from(GenomeAssemblies).' \
          'usingschema(/schema/GenomeAssemblies).' \
          "matching(tab==[\"Prokaryotes\"] and q==\"#{taxon}\""
  if cli[:reference]
    query += ' and refseq_category==["representative"]'
  else
    status = {
      complete: 'Complete',
      chromosome: ' Chromosome', # <- The leading space is *VERY* important!
      scaffold: 'Scaffold',
      contig: 'Contig'
    }.map { |k, v| '"' + v + '"' if cli[k] }.compact.join(',')
    query += ' and level==[' + status + ']'
  end
  query += ')'
  fields = 'organism|organism,assembly|assembly,replicons|replicons,' \
           'level|level,ftp_path_genbank|ftp_path_genbank,' \
           'release_date|release_date,strain|strain'
  url_base + URI.encode_www_form(q: query, fields: fields, nolimit: 'on')
end

# Parses the listing in +doc+ and returns a Hash mapping each dataset name
# to the parameters of its remote dataset (+:ids+, +:db+, +:universe+) and
# its metadata (+:md+). Rows without assembly or GenBank path are skipped.
def parse_ncbi_list(doc)
  ds = {}
  CSV.parse(doc, headers: true).each do |r|
    asm = r['assembly']
    next if asm.nil? || asm.empty? || asm == '-'
    next unless r['ftp_path_genbank']

    # Get replicons: keep only what lies between the last ':' and the
    # first '/' of each entry
    rep = r['replicons'].nil? ? nil : r['replicons']
      .split('; ').map { |i| i.gsub(/.*:/, '').gsub(/\/.*/, '') }

    # Set name
    if cli[:legacy_name] && cli[:reference]
      n = r['#organism'].miga_name
    else
      acc =
        if cli[:legacy_name] &&
           ['Complete', ' Chromosome'].include?(r['level'])
          rep.nil? ? '' : rep.first.to_s
        else
          asm
        end
      # Non-destructive: the version is only dropped from the name, not
      # from the accessions saved as metadata below
      acc = acc.sub(/\.\d+\Z/, '') unless cli[:add_version]
      n = "#{r['#organism']}_#{acc}".miga_name
    end

    # Register for download
    fna_url = r['ftp_path_genbank'] + '/' +
              File.basename(r['ftp_path_genbank']) + '_genomic.fna.gz'
    ds[n] = {
      ids: [fna_url], db: :assembly_gz, universe: :web,
      md: {
        type: :genome, ncbi_asm: asm, strain: r['strain']
      }
    }
    ds[n][:md][:ncbi_nuccore] = rep.join(',') unless rep.nil?
    unless r['release_date'].nil?
      ds[n][:md][:release_date] = Time.parse(r['release_date']).to_s
    end
  end
  ds
end

private :ncbi_list_url, :parse_ncbi_list
218
+ end