miga-base 0.4.1.0 → 0.4.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/miga +2 -244
- data/lib/miga/cli/action/about.rb +44 -0
- data/lib/miga/cli/action/add.rb +139 -0
- data/lib/miga/cli/action/add_result.rb +26 -0
- data/lib/miga/cli/action/console.rb +19 -0
- data/lib/miga/cli/action/daemon.rb +74 -0
- data/lib/miga/cli/action/date.rb +18 -0
- data/lib/miga/cli/action/doctor.rb +210 -0
- data/lib/miga/cli/action/edit.rb +24 -0
- data/lib/miga/cli/action/files.rb +31 -0
- data/lib/miga/cli/action/find.rb +48 -0
- data/lib/miga/cli/action/generic.rb +44 -0
- data/lib/miga/cli/action/get.rb +132 -0
- data/lib/miga/cli/action/init.rb +343 -0
- data/lib/miga/cli/action/ln.rb +42 -0
- data/lib/miga/cli/action/ls.rb +55 -0
- data/lib/miga/cli/action/ncbi_get.rb +218 -0
- data/lib/miga/cli/action/new.rb +45 -0
- data/lib/miga/cli/action/next_step.rb +27 -0
- data/lib/miga/cli/action/plugins.rb +28 -0
- data/lib/miga/cli/action/rm.rb +25 -0
- data/lib/miga/cli/action/run.rb +39 -0
- data/lib/miga/cli/action/stats.rb +140 -0
- data/lib/miga/cli/action/summary.rb +49 -0
- data/lib/miga/cli/action/tax_dist.rb +102 -0
- data/lib/miga/cli/action/tax_index.rb +47 -0
- data/lib/miga/cli/action/tax_set.rb +59 -0
- data/lib/miga/cli/action/tax_test.rb +77 -0
- data/lib/miga/cli/action.rb +66 -0
- data/lib/miga/cli/base.rb +90 -0
- data/lib/miga/cli.rb +426 -0
- data/lib/miga/project/result.rb +14 -6
- data/lib/miga/remote_dataset.rb +1 -1
- data/lib/miga/tax_index.rb +5 -4
- data/lib/miga/taxonomy/base.rb +63 -0
- data/lib/miga/taxonomy.rb +87 -92
- data/lib/miga/version.rb +6 -6
- data/test/taxonomy_test.rb +49 -9
- data/utils/distance/commands.rb +11 -11
- data/utils/distance/pipeline.rb +5 -5
- metadata +43 -49
- data/actions/about.rb +0 -43
- data/actions/add.rb +0 -129
- data/actions/add_result.rb +0 -30
- data/actions/daemon.rb +0 -55
- data/actions/date.rb +0 -14
- data/actions/doctor.rb +0 -201
- data/actions/edit.rb +0 -33
- data/actions/files.rb +0 -43
- data/actions/find.rb +0 -41
- data/actions/get.rb +0 -105
- data/actions/init.rb +0 -301
- data/actions/ln.rb +0 -47
- data/actions/ls.rb +0 -61
- data/actions/ncbi_get.rb +0 -192
- data/actions/new.rb +0 -44
- data/actions/next_step.rb +0 -33
- data/actions/plugins.rb +0 -25
- data/actions/rm.rb +0 -29
- data/actions/run.rb +0 -45
- data/actions/stats.rb +0 -149
- data/actions/summary.rb +0 -57
- data/actions/tax_dist.rb +0 -106
- data/actions/tax_index.rb +0 -46
- data/actions/tax_set.rb +0 -63
- data/actions/tax_test.rb +0 -80
@@ -0,0 +1,343 @@
|
|
1
|
+
# @package MiGA
|
2
|
+
# @license Artistic-2.0
|
3
|
+
|
4
|
+
require 'miga/cli/action'
|
5
|
+
require 'shellwords'
|
6
|
+
|
7
|
+
class MiGA::Cli::Action::Init < MiGA::Cli::Action
|
8
|
+
|
9
|
+
# Declares the command-line interface of the init action.
#
# Flags the CLI as interactive, registers the default option values, and
# defines the supported switches: -c/--config, --[no-]mytaxa, --daemon-type
# and --ask-all. Each switch stores its value in the matching +cli+ key.
def parse_cli
  cli.interactive = true
  cli.defaults = {
    mytaxa: nil,
    config: File.expand_path('.miga_modules', ENV['HOME']),
    ask: false, auto: false, dtype: :bash
  }
  cli.parse do |opt|
    opt.on(
      '-c', '--config PATH',
      'Path to the Bash configuration file',
      "By default: #{cli[:config]}"
    ) { |v| cli[:config] = v }
    opt.on(
      '--[no-]mytaxa',
      # Help text fixed: the original read "MyTaxa its dependencies"
      'Should I try setting up MyTaxa and its dependencies?',
      'By default: interactive (true if --auto)'
    ) { |v| cli[:mytaxa] = v }
    opt.on(
      '--daemon-type STRING',
      'Type of daemon launcher, one of: bash, qsub, msub, slurm',
      "By default: interactive (#{cli[:dtype]} if --auto)"
    ) { |v| cli[:dtype] = v.to_sym }
    opt.on(
      '--ask-all',
      'Ask for the location of all software',
      'By default, only the locations missing in PATH are requested'
    ) { |v| cli[:ask] = v }
  end
end
|
37
|
+
|
38
|
+
# Runs the interactive MiGA initialization.
#
# Steps, in order:
# 1. Optionally prints utils/requirements.txt.
# 2. (Re)creates ~/.miga_rc, a Bash file that loads the startup script and
#    extends PATH with the location of any software not already in PATH.
# 3. Locates every program listed in utils/requirements.txt.
# 4. Checks the MyTaxa databases (only if MyTaxa was requested).
# 5. Tests, and installs if missing, the required R packages and Ruby gems.
# 6. Writes the daemon template to ~/.miga_daemon.json unless the user
#    chooses to preserve an existing one.
#
# Terminates the process with status 0 if the user declines to overwrite a
# previous configuration, and with status 1 if a MyTaxa database is missing.
# Raises RuntimeError if an R package or Ruby gem cannot be auto-installed.
def perform
  miga = MiGA.root_path
  cli.puts <<BANNER
===[ Welcome to MiGA, the Microbial Genome Atlas ]===

I'm the initialization script, and I'll sniff around your computer to
make sure you have all the requirements for MiGA data processing.

BANNER

  if cli.ask_user(
        'Would you like to see all the requirements before starting?',
        'no', %w(yes no)) == 'yes'
    cli.puts ''
    File.open(File.expand_path('utils/requirements.txt', miga), 'r') do |fh|
      fh.each_line { |ln| cli.puts ln }
    end
    cli.puts ''
  end

  rc_path = File.expand_path('.miga_rc', ENV['HOME'])
  if File.exist? rc_path
    if cli.ask_user(
          'I found a previous configuration. Do you want to continue?',
          'yes', %w(yes no)) == 'no'
      cli.puts 'OK, see you soon!'
      exit(0)
    end
  end
  # The handle is released in the method-level +ensure+ below
  rc_fh = File.open(rc_path, 'w')
  rc_fh.puts <<BASH
#!/bin/bash
# `miga init` made this on #{Time.now}

BASH

  # Check bash configuration file
  unless File.exist? cli[:config]
    cli[:config] = cli.ask_user(
      'Is there a script I need to load at startup?',
      cli[:config])
  end
  if File.exist? cli[:config]
    cli[:config] = File.expand_path(cli[:config])
    cli.puts "Found bash configuration script: #{cli[:config]}"
    rc_fh.puts "MIGA_STARTUP='#{cli[:config]}'"
    rc_fh.puts '. "$MIGA_STARTUP"'
  else
    # Nothing to load: run_cmd sources this path, so use a harmless one
    cli[:config] = '/dev/null'
  end
  cli.puts ''

  # Check for software requirements
  cli.puts 'Looking for requirements:'
  if cli[:mytaxa].nil?
    cli[:mytaxa] = cli.ask_user(
      'Should I include MyTaxa modules?',
      'yes', %w(yes no)) == 'yes'
  end
  rc_fh.puts 'export MIGA_MYTAXA="no"' unless cli[:mytaxa]
  paths = {}
  rc_fh.puts 'MIGA_PATH=""'
  File.open(File.expand_path('utils/requirements.txt', miga), 'r') do |fh|
    fh.each_line do |ln|
      # Skip the two header lines; fh.lineno is tied to this handle, unlike
      # the global $. that any other read could overwrite
      next if fh.lineno < 3
      r = ln.chomp.split(/\t+/)
      next if r[0] =~ /\(opt\)$/ && !cli[:mytaxa]
      cli.print "Testing #{r[0]}#{" (#{r[3]})" if r[3]}... "
      path = nil
      loop do
        # `which` prints nothing when the program is missing, and the
        # dirname of an empty string is '.'
        d_path = File.dirname(run_cmd(cli, "which #{r[1].shellescape}"))
        if cli[:ask] || d_path == '.'
          path = cli.ask_user('Where can I find it?', d_path, nil, true)
        else
          path = d_path
          cli.puts path
        end
        if File.executable?(File.expand_path(r[1], path))
          if d_path != path
            rc_fh.puts "MIGA_PATH=\"#{path}:$MIGA_PATH\" # #{r[1]}"
          end
          break
        end
        cli.print "I cannot find #{r[1]} "
      end
      # Stored already shell-escaped, ready to be embedded in commands
      paths[r[1]] = File.expand_path(r[1], path).shellescape
    end
  end
  rc_fh.puts 'export PATH="$MIGA_PATH$PATH"'
  cli.puts ''

  # Check for other files
  if cli[:mytaxa]
    cli.puts 'Looking for MyTaxa databases:'
    mt = File.dirname paths["MyTaxa"]
    cli.print 'Looking for scores... '
    unless Dir.exist?(File.expand_path('db', mt))
      cli.puts "no.\nExecute 'python2 #{mt}/utils/download_db.py'."
      exit(1)
    end
    cli.puts 'yes.'
    cli.print 'Looking for diamond db... '
    unless File.exist?(File.expand_path('AllGenomes.faa.dmnd', mt))
      cli.puts "no.\nDownload " \
        "'http://enve-omics.ce.gatech.edu/data/public_mytaxa/" \
        "AllGenomes.faa.dmnd' into #{mt}."
      exit(1)
    end
    cli.puts ''
  end

  # Check for R packages
  cli.puts 'Looking for R packages:'
  %w(enveomics.R ape cluster vegan).each do |pkg|
    cli.print "Testing #{pkg}... "
    if test_r_package(cli, paths, pkg)
      cli.puts 'yes.'
    else
      cli.puts 'no, installing'
      cli.print '' + install_r_package(cli, paths, pkg)
      unless test_r_package(cli, paths, pkg)
        raise "Unable to auto-install R package: #{pkg}"
      end
    end
  end
  cli.puts ''

  # Check for Ruby gems
  cli.puts 'Looking for Ruby gems:'
  %w(sqlite3 daemons json).each do |pkg|
    cli.print "Testing #{pkg}... "
    if test_ruby_gem(cli, paths, pkg)
      cli.puts 'yes.'
    else
      cli.puts 'no, installing'
      # This hackey mess is meant to ensure the test and installation are done
      # on the configuration Ruby, not on the Ruby currently executing the init
      # action
      cli.print install_ruby_gem(cli, paths, pkg)
      unless test_ruby_gem(cli, paths, pkg)
        raise "Unable to auto-install Ruby gem: #{pkg}"
      end
    end
  end
  cli.puts ''

  # Configure daemon
  cli.puts 'Default daemon configuration:'
  daemon_f = File.expand_path('.miga_daemon.json', ENV['HOME'])
  unless File.exist?(daemon_f) && cli.ask_user(
        'A template daemon already exists, do you want to preserve it?',
        'yes', %w(yes no)) == 'yes'
    v = {created: Time.now.to_s, updated: Time.now.to_s}
    # cli[:dtype] is a Symbol while the valid answers and the branches below
    # are Strings; normalize both directions so the default is recognized.
    # NOTE(review): assumes ask_user may return the default unchanged -
    # confirm against its implementation
    v[:type] = cli.ask_user(
      'Please select the type of daemon you want to setup',
      cli[:dtype].to_s, %w(bash qsub msub slurm)).to_s
    case v[:type]
    when 'bash'
      v[:latency] = cli.ask_user(
        'How long should I sleep? (in seconds)', '30').to_i
      v[:maxjobs] = cli.ask_user(
        'How many jobs can I launch at once?', '6').to_i
      v[:ppn] = cli.ask_user(
        'How many CPUs can I use per job?', '2').to_i
      cli.puts 'Setting up internal daemon defaults.'
      cli.puts 'If you don\'t understand this just leave default values:'
      v[:cmd] = cli.ask_user(
        "How should I launch tasks?\n  %1$s: script path, " \
        "%2$s: variables, %3$d: CPUs, %4$s: log file, %5$s: task name.\n",
        "%2$s '%1$s' > '%4$s' 2>&1")
      v[:var] = cli.ask_user(
        "How should I pass variables?\n  %1$s: keys, %2$s: values.\n",
        "%1$s=%2$s")
      v[:varsep] = cli.ask_user(
        'What should I use to separate variables?', ' ')
      v[:alive] = cli.ask_user(
        "How can I know that a process is still alive?\n  %1$s: PID, " \
        "output should be 1 for running and 0 for non-running.\n",
        "ps -p '%1$s'|tail -n+2|wc -l")
      v[:kill] = cli.ask_user(
        "How should I terminate tasks?\n  %s: process ID.", "kill -9 '%s'")
    when 'slurm'
      queue = cli.ask_user(
        'What queue should I use?', nil, nil, true)
      v[:latency] = cli.ask_user(
        'How long should I sleep? (in seconds)', '150').to_i
      v[:maxjobs] = cli.ask_user(
        'How many jobs can I launch at once?', '300').to_i
      v[:ppn] = cli.ask_user(
        'How many CPUs can I use per job?', '2').to_i
      cli.puts 'Setting up internal daemon defaults'
      cli.puts 'If you don\'t understand this just leave default values:'
      # The log file placeholder is a string (%4$s), as used in the template
      v[:cmd] = cli.ask_user(
        "How should I launch tasks?\n  %1$s: script path, " \
        "%2$s: variables, %3$d: CPUs, %4$s: log file, %5$s: task name.\n",
        "%2$s sbatch --partition='#{queue}' --export=ALL " \
        "--nodes=1 --ntasks-per-node=%3$d --output='%4$s' " \
        "--job-name='%5$s' --mem=9G --time=12:00:00 %1$s " \
        "| perl -pe 's/.* //'")
      v[:var] = cli.ask_user(
        "How should I pass variables?\n  %1$s: keys, %2$s: values.\n",
        "%1$s=%2$s")
      v[:varsep] = cli.ask_user(
        'What should I use to separate variables?', ' ')
      v[:alive] = cli.ask_user(
        "How can I know that a process is still alive?\n  %1$s: job id, " \
        "output should be 1 for running and 0 for non-running.\n",
        "squeue -h -o %%t -j '%1$s' | grep '^PD\\|R\\|CF\\|CG$' " \
        "| tail -n 1 | wc -l")
      v[:kill] = cli.ask_user(
        "How should I terminate tasks?\n  %s: process ID.", "scancel '%s'")
    else # [qm]sub
      queue = cli.ask_user('What queue should I use?', nil, nil, true)
      v[:latency] = cli.ask_user(
        'How long should I sleep? (in seconds)', '150').to_i
      v[:maxjobs] = cli.ask_user(
        'How many jobs can I launch at once?', '300').to_i
      v[:ppn] = cli.ask_user(
        'How many CPUs can I use per job?', '2').to_i
      cli.puts 'Setting up internal daemon defaults.'
      cli.puts 'If you don\'t understand this just leave default values:'
      # The log file placeholder is a string (%4$s), as used in the template
      v[:cmd] = cli.ask_user(
        "How should I launch tasks?\n  %1$s: script path, " \
        "%2$s: variables, %3$d: CPUs, %4$s: log file, %5$s: task name.\n",
        "#{v[:type]} -q '#{queue}' -v '%2$s' -l nodes=1:ppn=%3$d %1$s " \
        "-j oe -o '%4$s' -N '%5$s' -l mem=9g -l walltime=12:00:00 " \
        "| grep .")
      v[:var] = cli.ask_user(
        "How should I pass variables?\n  %1$s: keys, %2$s: values.\n",
        "%1$s=%2$s")
      v[:varsep] = cli.ask_user(
        'What should I use to separate variables?', ',')
      if v[:type] == 'qsub'
        v[:alive] = cli.ask_user(
          "How can I know that a process is still alive?\n  " \
          "%1$s: job id, output should be 1 for running and " \
          "0 for non-running.\n",
          "qstat -f '%1$s'|grep ' job_state ='|perl -pe 's/.*= //'" \
          "|grep '[^C]'|tail -n1|wc -l|awk '{print $1}'")
        v[:kill] = cli.ask_user(
          "How should I terminate tasks?\n  %s: process ID.", "qdel '%s'")
      else # msub
        v[:alive] = cli.ask_user(
          "How can I know that a process is still alive?\n  " \
          "%1$s: job id, output should be 1 for running and " \
          "0 for non-running.\n",
          "checkjob '%1$s'|grep '^State:'|perl -pe 's/.*: //'" \
          "|grep 'Deferred\\|Hold\\|Idle\\|Starting\\|Running\\|Blocked'" \
          "|tail -n1|wc -l|awk '{print $1}'")
        v[:kill] = cli.ask_user(
          "How should I terminate tasks?\n  %s: process ID.",
          "canceljob '%s'")
      end
    end
    File.open(daemon_f, 'w') { |fh| fh.puts JSON.pretty_generate(v) }
  end
  cli.puts ''

  rc_fh.puts <<FOOT

MIGA_CONFIG_VERSION='#{MiGA::MiGA.VERSION}'
MIGA_CONFIG_LONGVERSION='#{MiGA::MiGA.LONG_VERSION}'
MIGA_CONFIG_DATE='#{Time.now}'

FOOT

  cli.puts 'Configuration complete. MiGA is ready to work!'
  cli.puts ''

ensure
  # Flush and release ~/.miga_rc on every way out (normal end, exit, raise).
  # rc_fh is nil when leaving before the file was opened.
  rc_fh.close unless rc_fh.nil? || rc_fh.closed?
end
|
308
|
+
|
309
|
+
# Deliberate no-op: takes no arguments, does nothing and returns nil.
def empty_action; end
|
311
|
+
|
312
|
+
# Runs +cmd+ in a shell after sourcing the startup script in cli[:config],
# and returns the standard output of the command as a String.
#
# The configuration path is shell-escaped, so paths containing spaces, `$`,
# backticks or quotes are sourced literally instead of being expanded by the
# shell. +cmd+ is passed as-is: callers are responsible for escaping it.
def run_cmd(cli, cmd)
  startup = cli[:config].to_s.shellescape
  `. #{startup} && #{cmd}`
end
|
315
|
+
|
316
|
+
# Pipes the R expression +cmd+ into the R interpreter (run with
# `--vanilla -q`, stderr merged into stdout) through run_cmd, and returns
# the output.
#
# +paths+ maps program names to their location. The values are stored
# already shell-escaped by #perform, so paths['R'] is embedded verbatim:
# escaping it a second time would corrupt paths containing spaces or other
# special characters. +cmd+ is escaped here.
def run_r_cmd(cli, paths, cmd)
  run_cmd(cli, "echo #{cmd.shellescape} | #{paths['R']} --vanilla -q 2>&1")
end
|
320
|
+
|
321
|
+
# Tells whether the R package +pkg+ can be loaded, by running
# `library('pkg')` through run_r_cmd and returning the success of the last
# child process.
def test_r_package(cli, paths, pkg)
  load_expr = "library('#{pkg}')"
  run_r_cmd(cli, paths, load_expr)
  last_status = $?
  last_status.success?
end
|
325
|
+
|
326
|
+
# Asks R to install +pkg+ from the RStudio CRAN mirror and returns whatever
# run_r_cmd returns for that call.
def install_r_package(cli, paths, pkg)
  run_r_cmd(
    cli, paths,
    "install.packages('#{pkg}', repos='http://cran.rstudio.com/')"
  )
end
|
330
|
+
|
331
|
+
# Tells whether the gem +pkg+ can be required by the configured Ruby (the
# one in paths['ruby'], not necessarily the one running this action).
#
# paths['ruby'] is stored already shell-escaped by #perform and is embedded
# verbatim; escaping it again would corrupt paths with special characters.
# Returns the success of the last child process.
def test_ruby_gem(cli, paths, pkg)
  run_cmd(cli, "#{paths['ruby']} -r #{pkg.shellescape} -e '' 2>/dev/null")
  $?.success?
end
|
336
|
+
|
337
|
+
# Installs the gem +pkg+ (with `--user`) using the configured Ruby in
# paths['ruby'], by running Gem::GemRunner in a child interpreter. Returns
# the output of the command, with stderr merged into stdout.
#
# paths['ruby'] is stored already shell-escaped by #perform and is embedded
# verbatim; escaping it again would corrupt paths with special characters.
def install_ruby_gem(cli, paths, pkg)
  gem_cmd = "Gem::GemRunner.new.run %w(install --user #{pkg})"
  run_cmd(
    cli,
    "#{paths['ruby']} -r rubygems -r rubygems/gem_runner " \
    "-e #{gem_cmd.shellescape} 2>&1"
  )
end
|
343
|
+
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# @package MiGA
|
2
|
+
# @license Artistic-2.0
|
3
|
+
|
4
|
+
require 'miga/cli/action'
|
5
|
+
|
6
|
+
class MiGA::Cli::Action::Ln < MiGA::Cli::Action
|
7
|
+
|
8
|
+
# Declares the command-line interface of the ln action: the project and
# dataset selectors, the target project (-Q), --force, the linking method
# (--symlink or --copy instead of the default hard links), and the dataset
# filters.
def parse_cli
  cli.defaults = { info: false, force: false, method: :hardlink }
  cli.parse do |parser|
    cli.opt_object(parser, [:project, :dataset_opt])
    parser.on('-Q', '--project-target PATH',
              '(Mandatory) Path to the project where to link the dataset') do |path|
      cli[:project2] = path
    end
    parser.on('-f', '--force',
              'Force linking, even if dataset\'s preprocessing is incomplete') do |flag|
      cli[:force] = flag
    end
    parser.on('-s', '--symlink',
              'Create symlinks instead of the default hard links') do
      cli[:method] = :symlink
    end
    parser.on('-c', '--copy',
              'Create copies instead of the default hard links') do
      cli[:method] = :copy
    end
    cli.opt_filter_datasets(parser)
  end
end
|
31
|
+
|
32
|
+
# Imports the selected datasets into the target project (-Q) using the
# configured linking method. Datasets with incomplete preprocessing are
# skipped unless --force was given. Prints the name of each imported
# dataset.
def perform
  # The source project is loaded but its handle is not used further here
  cli.load_project
  target = cli.load_project(:project2, '-Q')
  cli.load_and_filter_datasets.each do |dataset|
    next if !cli[:force] && !dataset.done_preprocessing?
    cli.puts dataset.name
    target.import_dataset(dataset, cli[:method])
  end
end
|
42
|
+
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
# @package MiGA
|
2
|
+
# @license Artistic-2.0
|
3
|
+
|
4
|
+
require 'miga/cli/action'
|
5
|
+
|
6
|
+
class MiGA::Cli::Action::Ls < MiGA::Cli::Action
|
7
|
+
|
8
|
+
# Declares the command-line interface of the ls action: the project and
# dataset selectors, the dataset filters, and the output switches (--info,
# --processing, --metadata, --tab, --silent).
def parse_cli
  cli.defaults = { info: false, processing: false, silent: false }
  cli.parse do |parser|
    cli.opt_object(parser, [:project, :dataset_opt])
    cli.opt_filter_datasets(parser)
    # Registers a switch whose value is stored unchanged under +key+
    store = lambda do |key, *spec|
      parser.on(*spec) { |value| cli[key] = value }
    end
    store.call(:info, '-i', '--info',
               'Print additional information on each dataset')
    store.call(:processing, '-p', '--processing',
               'Print information on processing advance')
    store.call(:datum, '-m', '--metadata STRING',
               'Print name and metadata field only',
               'If set, ignores -i and assumes --tab')
    store.call(:tabular, '--tab',
               'Return a tab-delimited table')
    store.call(:silent, '-s', '--silent',
               'No output and exit with non-zero status if the dataset list is empty')
  end
end
|
36
|
+
|
37
|
+
# Lists the selected datasets.
#
# With --silent nothing is printed and the process exits with status 1 if
# the list is empty, 0 otherwise. Otherwise the output is, in order of
# precedence: name and one metadata field (--metadata, '?' when the field
# is unset), the information table (--info), the preprocessing advance
# table (--processing), or just the dataset names.
def perform
  ds = cli.load_and_filter_datasets(cli[:silent])
  exit(ds.empty? ? 1 : 0) if cli[:silent]
  if !cli[:datum].nil?
    ds.each do |d|
      v = d.metadata[cli[:datum]]
      # Goes through cli.puts like every other branch of this method
      cli.puts "#{d.name}\t#{v.nil? ? '?' : v}"
    end
  elsif cli[:info]
    # Fully qualified, like MiGA::Dataset.PREPROCESSING_TASKS below
    cli.table(MiGA::Dataset.INFO_FIELDS, ds.map { |d| d.info })
  elsif cli[:processing]
    # profile_advance values are used as indexes into these labels
    comp = %w[- done queued]
    cli.table([:name] + MiGA::Dataset.PREPROCESSING_TASKS,
      ds.map { |d| [d.name] + d.profile_advance.map { |i| comp[i] } })
  else
    ds.each { |d| cli.puts d.name }
  end
end
|
55
|
+
end
|
@@ -0,0 +1,218 @@
|
|
1
|
+
# @package MiGA
|
2
|
+
# @license Artistic-2.0
|
3
|
+
|
4
|
+
require 'miga/cli/action'
|
5
|
+
require 'miga/remote_dataset'
|
6
|
+
require 'csv'
|
7
|
+
|
8
|
+
class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action
|
9
|
+
|
10
|
+
# Declares the command-line interface of the ncbi_get action: the project
# selector, the taxon, the assembly levels to download, the naming options,
# the blacklist, the dry/metadata modes, the saving frequency, and the
# query/unlink/remote-list/API-key switches.
def parse_cli
  cli.defaults = {
    query: false, unlink: false,
    reference: false, legacy_name: false,
    complete: false, chromosome: false,
    scaffold: false, contig: false, add_version: true, dry: false,
    get_md: false, only_md: false, save_every: 1
  }
  cli.parse do |parser|
    cli.opt_object(parser, [:project])
    # Registers a switch whose value is stored unchanged under +key+
    store = lambda do |key, *spec|
      parser.on(*spec) { |value| cli[key] = value }
    end
    store.call(:taxon, '-T', '--taxon STRING',
               '(Mandatory unless --reference) Taxon name (e.g., a species binomial)')
    store.call(:reference, '--reference',
               'Download all reference genomes (ignore any other status)')
    store.call(:complete, '--complete',
               'Download complete genomes')
    store.call(:chromosome, '--chromosome',
               'Download complete chromosomes')
    store.call(:scaffold, '--scaffold',
               'Download genomes in scaffolds')
    store.call(:contig, '--contig',
               'Download genomes in contigs')
    parser.on('--all', 'Download all genomes (in any status)') do
      [:complete, :chromosome, :scaffold, :contig].each do |level|
        cli[level] = true
      end
    end
    store.call(:add_version, '--no-version-name',
               'Do not add sequence version to the dataset name',
               'Only affects --complete and --chromosome')
    store.call(:legacy_name, '--legacy-name',
               'Use dataset names based on chromosome entries instead of assembly')
    store.call(:blacklist, '--blacklist PATH',
               'A file with dataset names to blacklist')
    store.call(:dry, '--dry',
               'Do not download or save the datasets')
    store.call(:get_md, '--get-metadata',
               'Only download and update metadata for existing datasets')
    store.call(:only_md, '--only-metadata',
               'Create datasets without input data but retrieve all metadata')
    store.call(:save_every, '--save-every INT', Integer,
               'Save project every this many downloaded datasets',
               'If zero, it saves the project only once upon completion',
               "By default: #{cli[:save_every]}")
    store.call(:query, '-q', '--query',
               'Register the datasets as queries, not reference datasets')
    store.call(:unlink, '-u', '--unlink',
               'Unlink all datasets in the project missing from the download list')
    store.call(:remote_list, '-R', '--remote-list PATH',
               'Path to an output file with the list of all datasets listed remotely')
    parser.on('--api-key STRING', 'NCBI API key') do |key|
      ENV['NCBI_API_KEY'] = key
    end
  end
end
|
94
|
+
|
95
|
+
# Downloads (or updates) NCBI genome assemblies into the project.
#
# Builds a query for the NCBI genome listing from the taxon and the
# requested assembly levels (or reference genomes only), parses the result
# as CSV, derives one dataset name per assembly, discards blacklisted names,
# and then creates each missing dataset (or, with --get-metadata, updates
# each existing one). Optionally writes the remote list to a file and
# unlinks local datasets missing from it.
#
# Raises RuntimeError if no genome type was requested.
def perform
  cli.ensure_par(taxon: '-T') unless cli[:reference]
  unless %w[reference complete chromosome scaffold contig].any? { |i| cli[i.to_sym] }
    raise 'No action requested: pick at least one type of genome'
  end
  # Nothing is saved in a dry run, so skip the deferred-save machinery
  cli[:save_every] = 1 if cli[:dry]

  project = cli.load_project
  d = []
  ds = {}
  downloaded = 0

  url_base = 'https://www.ncbi.nlm.nih.gov/genomes/solr2txt.cgi?'
  # The taxon is optional with --reference, so a missing value is sent as an
  # empty string instead of failing on nil
  taxon = cli[:taxon].to_s.tr('"', "'")
  url_param = {
    q: '[display()].' +
      'from(GenomeAssemblies).' +
      'usingschema(/schema/GenomeAssemblies).' +
      'matching(tab==["Prokaryotes"] and q=="' + taxon + '"',
    fields: 'organism|organism,assembly|assembly,replicons|replicons,' +
      'level|level,ftp_path_genbank|ftp_path_genbank,release_date|release_date,' +
      'strain|strain',
    nolimit: 'on',
  }
  if cli[:reference]
    url_param[:q] += ' and refseq_category==["representative"]'
  else
    status = {
      complete: 'Complete',
      chromosome: ' Chromosome', # <- The leading space is *VERY* important!
      scaffold: 'Scaffold',
      contig: 'Contig'
    }.map { |k, v| '"' + v + '"' if cli[k] }.compact.join(',')
    url_param[:q] += ' and level==[' + status + ']'
  end
  url_param[:q] += ')'
  url = url_base + URI.encode_www_form(url_param)
  cli.say 'Downloading genome list'
  doc = MiGA::RemoteDataset.download_url(url)
  CSV.parse(doc, headers: true).each do |r|
    asm = r['assembly']
    next if asm.nil? or asm.empty? or asm == '-'
    next unless r['ftp_path_genbank']

    # Get replicons: keep the text between the last ':' and the first '/'
    # of each '; '-separated entry
    rep = r['replicons'].nil? ? nil : r['replicons'].
      split('; ').map { |i| i.gsub(/.*:/, '') }.map { |i| i.gsub(/\/.*/, '') }

    # Set name
    if cli[:legacy_name] and cli[:reference]
      n = r['#organism'].miga_name
    else
      if cli[:legacy_name] and ['Complete', ' Chromosome'].include? r['level']
        acc = rep.nil? ? '' : rep.first
      else
        acc = asm
      end
      # NOTE(review): gsub! edits the string in place, so when acc is asm
      # the version suffix is also dropped from the ncbi_asm metadata
      # stored below - confirm this is intended
      acc.gsub!(/\.\d+\Z/, '') unless cli[:add_version]
      n = "#{r['#organism']}_#{acc}".miga_name
    end

    # Register for download
    fna_url = r['ftp_path_genbank'] + '/' +
      File.basename(r['ftp_path_genbank']) + '_genomic.fna.gz'
    ds[n] = {
      ids: [fna_url], db: :assembly_gz, universe: :web,
      md: {
        type: :genome, ncbi_asm: asm, strain: r['strain']
      }
    }
    ds[n][:md][:ncbi_nuccore] = rep.join(',') unless rep.nil?
    ds[n][:md][:release_date] =
      Time.parse(r['release_date']).to_s unless r['release_date'].nil?
  end

  # Discard blacklisted
  unless cli[:blacklist].nil?
    cli.say "Discarding datasets in #{cli[:blacklist]}"
    File.readlines(cli[:blacklist]).
      select { |i| i !~ /^#/ }.map(&:chomp).each { |i| ds.delete i }
  end

  # Download entries
  cli.say "Downloading #{ds.size} " + (ds.size == 1 ? 'entry' : 'entries')
  project.do_not_save = true if cli[:save_every] != 1
  ds.each do |name, body|
    d << name
    cli.puts name
    # Creating requires the dataset to be absent; updating metadata
    # (--get-metadata) requires it to be present
    next if project.dataset(name).nil? == cli[:get_md]
    downloaded += 1
    next if cli[:dry]
    cli.say '  Locating remote dataset'
    body[:md][:metadata_only] = true if cli[:only_md]
    rd = MiGA::RemoteDataset.new(body[:ids], body[:db], body[:universe])
    if cli[:get_md]
      cli.say '  Updating dataset'
      rd.update_metadata(project.dataset(name), body[:md])
    else
      cli.say '  Creating dataset'
      rd.save_to(project, name, !cli[:query], body[:md])
      project.add_dataset(name)
    end
    project.save! if cli[:save_every] > 1 and (downloaded % cli[:save_every]) == 0
  end

  project.do_not_save = false
  project.save! if cli[:save_every] != 1

  # Finalize
  cli.say "Datasets listed: #{d.size}"
  cli.say "Datasets #{cli[:dry] ? 'to download' : 'downloaded'}: " +
    downloaded.to_s
  unless cli[:remote_list].nil?
    File.open(cli[:remote_list], 'w') do |fh|
      d.each { |i| fh.puts i }
    end
  end
  if cli[:unlink]
    unlink = project.dataset_names - d
    unlink.each { |i| project.unlink_dataset(i).remove! }
    cli.say "Datasets unlinked: #{unlink.size}"
  end
end
|
218
|
+
end
|