miga-base 0.4.1.0 → 0.4.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/miga +2 -244
- data/lib/miga/cli/action/about.rb +44 -0
- data/lib/miga/cli/action/add.rb +139 -0
- data/lib/miga/cli/action/add_result.rb +26 -0
- data/lib/miga/cli/action/console.rb +19 -0
- data/lib/miga/cli/action/daemon.rb +74 -0
- data/lib/miga/cli/action/date.rb +18 -0
- data/lib/miga/cli/action/doctor.rb +210 -0
- data/lib/miga/cli/action/edit.rb +24 -0
- data/lib/miga/cli/action/files.rb +31 -0
- data/lib/miga/cli/action/find.rb +48 -0
- data/lib/miga/cli/action/generic.rb +44 -0
- data/lib/miga/cli/action/get.rb +132 -0
- data/lib/miga/cli/action/init.rb +343 -0
- data/lib/miga/cli/action/ln.rb +42 -0
- data/lib/miga/cli/action/ls.rb +55 -0
- data/lib/miga/cli/action/ncbi_get.rb +218 -0
- data/lib/miga/cli/action/new.rb +45 -0
- data/lib/miga/cli/action/next_step.rb +27 -0
- data/lib/miga/cli/action/plugins.rb +28 -0
- data/lib/miga/cli/action/rm.rb +25 -0
- data/lib/miga/cli/action/run.rb +39 -0
- data/lib/miga/cli/action/stats.rb +140 -0
- data/lib/miga/cli/action/summary.rb +49 -0
- data/lib/miga/cli/action/tax_dist.rb +102 -0
- data/lib/miga/cli/action/tax_index.rb +47 -0
- data/lib/miga/cli/action/tax_set.rb +59 -0
- data/lib/miga/cli/action/tax_test.rb +77 -0
- data/lib/miga/cli/action.rb +66 -0
- data/lib/miga/cli/base.rb +90 -0
- data/lib/miga/cli.rb +426 -0
- data/lib/miga/project/result.rb +14 -6
- data/lib/miga/remote_dataset.rb +1 -1
- data/lib/miga/tax_index.rb +5 -4
- data/lib/miga/taxonomy/base.rb +63 -0
- data/lib/miga/taxonomy.rb +87 -92
- data/lib/miga/version.rb +6 -6
- data/test/taxonomy_test.rb +49 -9
- data/utils/distance/commands.rb +11 -11
- data/utils/distance/pipeline.rb +5 -5
- metadata +43 -49
- data/actions/about.rb +0 -43
- data/actions/add.rb +0 -129
- data/actions/add_result.rb +0 -30
- data/actions/daemon.rb +0 -55
- data/actions/date.rb +0 -14
- data/actions/doctor.rb +0 -201
- data/actions/edit.rb +0 -33
- data/actions/files.rb +0 -43
- data/actions/find.rb +0 -41
- data/actions/get.rb +0 -105
- data/actions/init.rb +0 -301
- data/actions/ln.rb +0 -47
- data/actions/ls.rb +0 -61
- data/actions/ncbi_get.rb +0 -192
- data/actions/new.rb +0 -44
- data/actions/next_step.rb +0 -33
- data/actions/plugins.rb +0 -25
- data/actions/rm.rb +0 -29
- data/actions/run.rb +0 -45
- data/actions/stats.rb +0 -149
- data/actions/summary.rb +0 -57
- data/actions/tax_dist.rb +0 -106
- data/actions/tax_index.rb +0 -46
- data/actions/tax_set.rb +0 -63
- data/actions/tax_test.rb +0 -80
@@ -0,0 +1,343 @@
|
|
1
|
+
# @package MiGA
|
2
|
+
# @license Artistic-2.0
|
3
|
+
|
4
|
+
require 'miga/cli/action'
|
5
|
+
require 'shellwords'
|
6
|
+
|
7
|
+
class MiGA::Cli::Action::Init < MiGA::Cli::Action
|
8
|
+
|
9
|
+
##
# Declare the command-line interface of the init action.
#
# Flags the CLI as interactive, registers the default option values and
# defines the supported switches (-c/--config, --[no-]mytaxa, --daemon-type
# and --ask-all). Every switch only stores its value in +cli+.
def parse_cli
  cli.interactive = true
  cli.defaults = {
    mytaxa: nil,
    config: File.expand_path('.miga_modules', ENV['HOME']),
    ask: false, auto: false, dtype: :bash
  }
  cli.parse do |opt|
    opt.on(
      '-c', '--config PATH',
      'Path to the Bash configuration file',
      "By default: #{cli[:config]}"
    ) { |v| cli[:config] = v }
    opt.on(
      '--[no-]mytaxa',
      'Should I try setting up MyTaxa and its dependencies?',
      'By default: interactive (true if --auto)'
    ) { |v| cli[:mytaxa] = v }
    opt.on(
      '--daemon-type STRING',
      'Type of daemon launcher, one of: bash, qsub, msub, slurm',
      "By default: interactive (#{cli[:dtype]} if --auto)"
    ) { |v| cli[:dtype] = v.to_sym }
    opt.on(
      '--ask-all',
      'Ask for the location of all software',
      'By default, only the locations missing in PATH are requested'
    ) { |v| cli[:ask] = v }
  end
end
|
37
|
+
|
38
|
+
##
# Interactively configure MiGA for the current user.
#
# Steps, in order: print the welcome banner (optionally listing
# utils/requirements.txt), confirm before overwriting an existing ~/.miga_rc,
# then write a new ~/.miga_rc while checking the startup script, the required
# software, the MyTaxa databases (if requested), the R packages and the Ruby
# gems, and finally set up the default daemon template in
# ~/.miga_daemon.json.
#
# Exits with status 0 if the user declines to continue, and with status 1 if
# a MyTaxa database is missing. Raises a RuntimeError if an R package or a
# Ruby gem cannot be installed.
def perform
  miga = MiGA.root_path
  cli.puts <<BANNER
===[ Welcome to MiGA, the Microbial Genome Atlas ]===

I'm the initialization script, and I'll sniff around your computer to
make sure you have all the requirements for MiGA data processing.

BANNER
  list_requirements(miga)

  rc_path = File.expand_path('.miga_rc', ENV['HOME'])
  if File.exist?(rc_path) && cli.ask_user(
       'I found a previous configuration. Do you want to continue?',
       'yes', %w(yes no)) == 'no'
    cli.puts 'OK, see you soon!'
    exit(0)
  end

  # The block form guarantees that the file is flushed and closed, including
  # when one of the checks below aborts through +exit+ or an exception
  File.open(rc_path, 'w') do |rc_fh|
    rc_fh.puts <<BASH
#!/bin/bash
# `miga init` made this on #{Time.now}

BASH
    check_startup_script(rc_fh)
    paths = check_software_requirements(rc_fh, miga)
    check_mytaxa_databases(paths) if cli[:mytaxa]
    check_r_packages(paths)
    check_ruby_gems(paths)
    configure_daemon
    rc_fh.puts <<FOOT

MIGA_CONFIG_VERSION='#{MiGA::MiGA.VERSION}'
MIGA_CONFIG_LONGVERSION='#{MiGA::MiGA.LONG_VERSION}'
MIGA_CONFIG_DATE='#{Time.now}'

FOOT
  end

  cli.puts 'Configuration complete. MiGA is ready to work!'
  cli.puts ''
end

##
# Offer to print utils/requirements.txt (located under +miga+) verbatim.
def list_requirements(miga)
  answer = cli.ask_user(
    'Would you like to see all the requirements before starting?',
    'no', %w(yes no))
  return unless answer == 'yes'
  cli.puts ''
  File.open(File.expand_path('utils/requirements.txt', miga), 'r') do |fh|
    fh.each_line { |ln| cli.puts ln }
  end
  cli.puts ''
end

##
# Determine the script to load at startup and register it in +rc_fh+. If no
# such file exists, +cli[:config]+ falls back to /dev/null.
def check_startup_script(rc_fh)
  unless File.exist? cli[:config]
    cli[:config] = cli.ask_user(
      'Is there a script I need to load at startup?',
      cli[:config])
  end
  if File.exist? cli[:config]
    cli[:config] = File.expand_path(cli[:config])
    cli.puts "Found bash configuration script: #{cli[:config]}"
    rc_fh.puts "MIGA_STARTUP='#{cli[:config]}'"
    rc_fh.puts '. "$MIGA_STARTUP"'
  else
    cli[:config] = '/dev/null'
  end
  cli.puts ''
end

##
# Locate every program listed in utils/requirements.txt (under +miga+),
# writing the required PATH additions to +rc_fh+.
#
# Returns a Hash mapping each command name to the absolute, unescaped path
# of its executable. Callers are expected to shell-escape the values when
# building commands.
def check_software_requirements(rc_fh, miga)
  cli.puts 'Looking for requirements:'
  if cli[:mytaxa].nil?
    cli[:mytaxa] = cli.ask_user(
      'Should I include MyTaxa modules?',
      'yes', %w(yes no)) == 'yes'
  end
  rc_fh.puts 'export MIGA_MYTAXA="no"' unless cli[:mytaxa]
  paths = {}
  rc_fh.puts 'MIGA_PATH=""'
  File.open(File.expand_path('utils/requirements.txt', miga), 'r') do |fh|
    fh.each_line do |ln|
      # The first two lines are skipped (presumably table headers); use the
      # handle's own counter instead of the global $.
      next if fh.lineno < 3
      r = ln.chomp.split(/\t+/)
      next if r[1].nil? # blank or malformed line without a command column
      next if r[0] =~ /\(opt\)$/ && !cli[:mytaxa]
      cli.print "Testing #{r[0]}#{" (#{r[3]})" if r[3]}... "
      paths[r[1]] = locate_software(rc_fh, r[1])
    end
  end
  rc_fh.puts 'export PATH="$MIGA_PATH$PATH"'
  cli.puts ''
  paths
end

##
# Find the directory containing the executable +cmd+, asking the user when
# it's not found with `which` (or always, with --ask-all), and repeating
# until an executable is found. Directories differing from the one reported
# by `which` are prepended to MIGA_PATH in +rc_fh+.
#
# Returns the absolute path of the executable, without shell escaping.
def locate_software(rc_fh, cmd)
  loop do
    d_path = File.dirname(run_cmd(cli, "which #{cmd.shellescape}"))
    if cli[:ask] || d_path == '.'
      path = cli.ask_user('Where can I find it?', d_path, nil, true)
    else
      path = d_path
      cli.puts path
    end
    full_path = File.expand_path(cmd, path)
    if File.executable?(full_path)
      if d_path != path
        rc_fh.puts "MIGA_PATH=\"#{path}:$MIGA_PATH\" # #{cmd}"
      end
      return full_path
    end
    cli.print "I cannot find #{cmd} "
  end
end

##
# Check that the MyTaxa databases exist next to the MyTaxa executable
# registered in +paths+. Exits with status 1 if any is missing.
def check_mytaxa_databases(paths)
  cli.puts 'Looking for MyTaxa databases:'
  mt = File.dirname paths["MyTaxa"]
  cli.print 'Looking for scores... '
  unless Dir.exist?(File.expand_path('db', mt))
    cli.puts "no.\nExecute 'python2 #{mt}/utils/download_db.py'."
    exit(1)
  end
  cli.puts 'yes.'
  cli.print 'Looking for diamond db... '
  unless File.exist?(File.expand_path('AllGenomes.faa.dmnd', mt))
    cli.puts "no.\nDownload " \
      "'http://enve-omics.ce.gatech.edu/data/public_mytaxa/" \
      "AllGenomes.faa.dmnd' into #{mt}."
    exit(1)
  end
  cli.puts ''
end

##
# Test the required R packages, attempting to install the missing ones.
# Raises a RuntimeError if a package is still unavailable afterwards.
def check_r_packages(paths)
  cli.puts 'Looking for R packages:'
  %w(enveomics.R ape cluster vegan).each do |pkg|
    cli.print "Testing #{pkg}... "
    if test_r_package(cli, paths, pkg)
      cli.puts 'yes.'
    else
      cli.puts 'no, installing'
      cli.print install_r_package(cli, paths, pkg)
      unless test_r_package(cli, paths, pkg)
        raise "Unable to auto-install R package: #{pkg}"
      end
    end
  end
  cli.puts ''
end

##
# Test the required Ruby gems, attempting to install the missing ones.
# Raises a RuntimeError if a gem is still unavailable afterwards.
def check_ruby_gems(paths)
  cli.puts 'Looking for Ruby gems:'
  %w(sqlite3 daemons json).each do |pkg|
    cli.print "Testing #{pkg}... "
    if test_ruby_gem(cli, paths, pkg)
      cli.puts 'yes.'
    else
      cli.puts 'no, installing'
      # The test and the installation run on the Ruby registered in +paths+,
      # not on the Ruby currently executing the init action
      cli.print install_ruby_gem(cli, paths, pkg)
      unless test_ruby_gem(cli, paths, pkg)
        raise "Unable to auto-install Ruby gem: #{pkg}"
      end
    end
  end
  cli.puts ''
end

##
# Create the default daemon template (~/.miga_daemon.json), unless one
# already exists and the user chooses to preserve it.
def configure_daemon
  cli.puts 'Default daemon configuration:'
  daemon_f = File.expand_path('.miga_daemon.json', ENV['HOME'])
  preserve = File.exist?(daemon_f) && cli.ask_user(
    'A template daemon already exists, do you want to preserve it?',
    'yes', %w(yes no)) == 'yes'
  unless preserve
    v = {created: Time.now.to_s, updated: Time.now.to_s}
    v[:type] = cli.ask_user(
      'Please select the type of daemon you want to setup',
      cli[:dtype], %w(bash qsub msub slurm))
    ask_daemon_settings(v)
    File.open(daemon_f, 'w') { |fh| fh.puts JSON.pretty_generate(v) }
  end
  cli.puts ''
end

##
# Ask for all the settings of a daemon of type +v[:type]+, storing the
# answers in the Hash +v+. The questions are the same for every type; only
# the suggested defaults differ.
def ask_daemon_settings(v)
  queue = nil
  unless v[:type] == 'bash'
    queue = cli.ask_user('What queue should I use?', nil, nil, true)
  end
  dft = daemon_defaults(v[:type], queue)
  v[:latency] = cli.ask_user(
    'How long should I sleep? (in seconds)', dft[:latency]).to_i
  v[:maxjobs] = cli.ask_user(
    'How many jobs can I launch at once?', dft[:maxjobs]).to_i
  v[:ppn] = cli.ask_user(
    'How many CPUs can I use per job?', '2').to_i
  cli.puts 'Setting up internal daemon defaults.'
  cli.puts 'If you don\'t understand this just leave default values:'
  v[:cmd] = cli.ask_user(
    "How should I launch tasks?\n %1$s: script path, " \
    "%2$s: variables, %3$d: CPUs, %4$s: log file, %5$s: task name.\n",
    dft[:cmd])
  v[:var] = cli.ask_user(
    "How should I pass variables?\n %1$s: keys, %2$s: values.\n",
    "%1$s=%2$s")
  v[:varsep] = cli.ask_user(
    'What should I use to separate variables?', dft[:varsep])
  v[:alive] = cli.ask_user(
    "How can I know that a process is still alive?\n " \
    "%1$s: #{dft[:alive_id]}, output should be 1 for running and " \
    "0 for non-running.\n",
    dft[:alive])
  v[:kill] = cli.ask_user(
    "How should I terminate tasks?\n %s: process ID.", dft[:kill])
end

##
# Suggested default answers for a daemon of the given +type+ ('bash',
# 'slurm', 'qsub' or 'msub') submitting to +queue+ (unused for 'bash').
def daemon_defaults(type, queue)
  case type
  when 'bash'
    {
      latency: '30', maxjobs: '6', varsep: ' ', alive_id: 'PID',
      cmd: "%2$s '%1$s' > '%4$s' 2>&1",
      alive: "ps -p '%1$s'|tail -n+2|wc -l",
      kill: "kill -9 '%s'"
    }
  when 'slurm'
    {
      latency: '150', maxjobs: '300', varsep: ' ', alive_id: 'job id',
      cmd: "%2$s sbatch --partition='#{queue}' --export=ALL " \
        "--nodes=1 --ntasks-per-node=%3$d --output='%4$s' " \
        "--job-name='%5$s' --mem=9G --time=12:00:00 %1$s " \
        "| perl -pe 's/.* //'",
      alive: "squeue -h -o %%t -j '%1$s' | grep '^PD\\|R\\|CF\\|CG$' " \
        "| tail -n 1 | wc -l",
      kill: "scancel '%s'"
    }
  else # [qm]sub
    dft = {
      latency: '150', maxjobs: '300', varsep: ',', alive_id: 'job id',
      cmd: "#{type} -q '#{queue}' -v '%2$s' -l nodes=1:ppn=%3$d %1$s " \
        "-j oe -o '%4$s' -N '%5$s' -l mem=9g -l walltime=12:00:00 " \
        "| grep ."
    }
    if type == 'qsub'
      dft[:alive] =
        "qstat -f '%1$s'|grep ' job_state ='|perl -pe 's/.*= //'" \
        "|grep '[^C]'|tail -n1|wc -l|awk '{print $1}'"
      dft[:kill] = "qdel '%s'"
    else # msub
      dft[:alive] =
        "checkjob '%1$s'|grep '^State:'|perl -pe 's/.*: //'" \
        "|grep 'Deferred\\|Hold\\|Idle\\|Starting\\|Running\\|Blocked'" \
        "|tail -n1|wc -l|awk '{print $1}'"
      dft[:kill] = "canceljob '%s'"
    end
    dft
  end
end

private :list_requirements, :check_startup_script,
        :check_software_requirements, :locate_software,
        :check_mytaxa_databases, :check_r_packages, :check_ruby_gems,
        :configure_daemon, :ask_daemon_settings, :daemon_defaults
|
308
|
+
|
309
|
+
# Deliberate no-op: takes no arguments and returns nil.
# NOTE(review): presumably overrides a hook of MiGA::Cli::Action so that
# nothing special happens when the action is called without arguments;
# confirm against the parent class.
def empty_action; end
|
311
|
+
|
312
|
+
##
# Run the shell command +cmd+ after sourcing the startup script stored in
# +cli[:config]+, and return the standard output of the command as String.
#
# The script path is shell-escaped, so characters such as spaces, quotes,
# backticks or +$+ in it are taken literally. +cmd+ is passed to the shell
# as is: callers must escape any variable part themselves.
def run_cmd(cli, cmd)
  startup = cli[:config].to_s.shellescape
  `. #{startup} && #{cmd}`
end
|
315
|
+
|
316
|
+
##
# Pipe the R code +cmd+ into the R executable registered as +paths['R']+
# (called with --vanilla -q, merging standard error into standard output),
# using #run_cmd. Returns whatever #run_cmd returns.
def run_r_cmd(cli, paths, cmd)
  r_code = cmd.shellescape
  r_bin = paths['R'].shellescape
  run_cmd(cli, "echo #{r_code} | #{r_bin} --vanilla -q 2>&1")
end
|
320
|
+
|
321
|
+
##
# Check whether the R package +pkg+ can be loaded, by running
# <tt>library('pkg')</tt> through #run_r_cmd. Returns true if the last
# child process exited successfully, false otherwise.
def test_r_package(cli, paths, pkg)
  r_code = "library('#{pkg}')"
  run_r_cmd(cli, paths, r_code)
  $?.success?
end
|
325
|
+
|
326
|
+
##
# Install the R package +pkg+ by running <tt>install.packages</tt> through
# #run_r_cmd, and return the output of that call.
#
# The repository is contacted over HTTPS, so the downloaded packages cannot
# be tampered with in transit.
def install_r_package(cli, paths, pkg)
  r_cmd = "install.packages('#{pkg}', repos='https://cran.rstudio.com/')"
  run_r_cmd(cli, paths, r_cmd)
end
|
330
|
+
|
331
|
+
##
# Check whether the gem +pkg+ can be required by the Ruby executable
# registered as +paths['ruby']+ (not necessarily the Ruby running this
# code). Returns true if the last child process exited successfully, false
# otherwise.
def test_ruby_gem(cli, paths, pkg)
  ruby_bin = paths['ruby'].shellescape
  gem_name = pkg.shellescape
  run_cmd(cli, "#{ruby_bin} -r #{gem_name} -e '' 2>/dev/null")
  $?.success?
end
|
336
|
+
|
337
|
+
##
# Install the gem +pkg+ with the Ruby executable registered as
# +paths['ruby']+, by evaluating a Gem::GemRunner call in that interpreter
# through #run_cmd. Standard error is merged into standard output. Returns
# whatever #run_cmd returns.
def install_ruby_gem(cli, paths, pkg)
  gem_cmd = "Gem::GemRunner.new.run %w(install --user #{pkg})"
  # Same arguments as a single backslash-continued string, assembled from
  # parts; only the amount of whitespace between arguments may differ
  parts = [
    paths['ruby'].shellescape,
    '-r rubygems -r rubygems/gem_runner',
    "-e #{gem_cmd.shellescape} 2>&1"
  ]
  run_cmd(cli, parts.join(' '))
end
|
343
|
+
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# @package MiGA
|
2
|
+
# @license Artistic-2.0
|
3
|
+
|
4
|
+
require 'miga/cli/action'
|
5
|
+
|
6
|
+
##
# CLI action that imports datasets into a second (target) project by hard
# link, symbolic link or copy.
class MiGA::Cli::Action::Ln < MiGA::Cli::Action

  ##
  # Declare the options of the action: the project and dataset selectors and
  # filters provided by +cli+, the target project (-Q), --force, and the
  # import method (hard links unless --symlink or --copy is passed).
  def parse_cli
    cli.defaults = { info: false, force: false, method: :hardlink }
    cli.parse do |opt|
      cli.opt_object(opt, [:project, :dataset_opt])
      opt.on(
        '-Q', '--project-target PATH',
        '(Mandatory) Path to the project where to link the dataset'
      ) do |path|
        cli[:project2] = path
      end
      opt.on(
        '-f', '--force',
        'Force linking, even if dataset\'s preprocessing is incomplete'
      ) do |flag|
        cli[:force] = flag
      end
      opt.on(
        '-s', '--symlink',
        'Create symlinks instead of the default hard links'
      ) do
        cli[:method] = :symlink
      end
      opt.on(
        '-c', '--copy',
        'Create copies instead of the default hard links'
      ) do
        cli[:method] = :copy
      end
      cli.opt_filter_datasets(opt)
    end
  end

  ##
  # Import each selected dataset into the target project, printing its name.
  # Datasets with incomplete preprocessing are skipped unless --force is set.
  def perform
    # The source project is loaded but its value is not used directly here
    # NOTE(review): presumably needed by load_and_filter_datasets; confirm
    cli.load_project
    target = cli.load_project(:project2, '-Q')
    cli.load_and_filter_datasets.each do |dataset|
      if cli[:force] || dataset.done_preprocessing?
        cli.puts dataset.name
        target.import_dataset(dataset, cli[:method])
      end
    end
  end
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
# @package MiGA
|
2
|
+
# @license Artistic-2.0
|
3
|
+
|
4
|
+
require 'miga/cli/action'
|
5
|
+
|
6
|
+
##
# CLI action that lists the datasets of a project, optionally with
# additional information, processing status or a single metadata field.
class MiGA::Cli::Action::Ls < MiGA::Cli::Action

  ##
  # Declare the options of the action: the project and dataset selectors and
  # filters provided by +cli+, plus --info, --processing, --metadata, --tab
  # and --silent.
  def parse_cli
    cli.defaults = {info: false, processing: false, silent: false}
    cli.parse do |opt|
      cli.opt_object(opt, [:project, :dataset_opt])
      cli.opt_filter_datasets(opt)
      opt.on(
        '-i', '--info',
        'Print additional information on each dataset'
      ){ |v| cli[:info] = v }
      opt.on(
        '-p', '--processing',
        'Print information on processing advance'
      ){ |v| cli[:processing] = v }
      opt.on(
        '-m', '--metadata STRING',
        'Print name and metadata field only',
        'If set, ignores -i and assumes --tab'
      ){ |v| cli[:datum] = v }
      opt.on(
        '--tab',
        'Return a tab-delimited table'
      ){ |v| cli[:tabular] = v }
      opt.on(
        '-s', '--silent',
        'No output and exit with non-zero status if the dataset list is empty'
      ){ |v| cli[:silent] = v }
    end
  end

  ##
  # List the selected datasets. With --silent nothing is printed and the
  # process exits with status 1 if the list is empty, 0 otherwise. Otherwise
  # the first applicable format is used: name and metadata field
  # (--metadata), information table (--info), processing table
  # (--processing), or names only.
  def perform
    ds = cli.load_and_filter_datasets(cli[:silent])
    exit(ds.empty? ? 1 : 0) if cli[:silent]
    if !cli[:datum].nil?
      ds.each do |d|
        v = d.metadata[cli[:datum]]
        # Missing fields are reported as '?'
        puts "#{d.name}\t#{v.nil? ? '?' : v}"
      end
    elsif cli[:info]
      # Fully-qualified constant, as for PREPROCESSING_TASKS below: this
      # class is opened with the compact A::B::C form, so MiGA is not part
      # of the lexical scope used to resolve a bare +Dataset+
      cli.table(MiGA::Dataset.INFO_FIELDS, ds.map { |d| d.info })
    elsif cli[:processing]
      # Labels indexed by the integer codes returned by profile_advance
      comp = %w[- done queued]
      cli.table([:name] + MiGA::Dataset.PREPROCESSING_TASKS,
        ds.map { |d| [d.name] + d.profile_advance.map { |i| comp[i] } })
    else
      ds.each { |d| cli.puts d.name }
    end
  end
end
|
@@ -0,0 +1,218 @@
|
|
1
|
+
# @package MiGA
|
2
|
+
# @license Artistic-2.0
|
3
|
+
|
4
|
+
require 'miga/cli/action'
|
5
|
+
require 'miga/remote_dataset'
|
6
|
+
require 'csv'
|
7
|
+
|
8
|
+
class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action
|
9
|
+
|
10
|
+
##
# Declare the command-line interface of the ncbi_get action.
#
# Nearly all switches only store their value under a key of +cli+, so they
# are registered through a small local helper. The exceptions are --all,
# which turns on the four assembly levels at once, and --api-key, which is
# stored in the NCBI_API_KEY environment variable.
def parse_cli
  cli.defaults = {
    query: false, unlink: false,
    reference: false, legacy_name: false,
    complete: false, chromosome: false,
    scaffold: false, contig: false, add_version: true, dry: false,
    get_md: false, only_md: false, save_every: 1
  }
  cli.parse do |opt|
    cli.opt_object(opt, [:project])
    # Registers a switch whose value is saved as cli[key]
    store = lambda do |key, *spec|
      opt.on(*spec) { |value| cli[key] = value }
    end

    store.call(
      :taxon, '-T', '--taxon STRING',
      '(Mandatory unless --reference) Taxon name (e.g., a species binomial)')
    store.call(
      :reference, '--reference',
      'Download all reference genomes (ignore any other status)')
    store.call(:complete, '--complete', 'Download complete genomes')
    store.call(:chromosome, '--chromosome', 'Download complete chromosomes')
    store.call(:scaffold, '--scaffold', 'Download genomes in scaffolds')
    store.call(:contig, '--contig', 'Download genomes in contigs')
    opt.on('--all', 'Download all genomes (in any status)') do
      [:complete, :chromosome, :scaffold, :contig].each do |level|
        cli[level] = true
      end
    end
    store.call(
      :add_version, '--no-version-name',
      'Do not add sequence version to the dataset name',
      'Only affects --complete and --chromosome')
    store.call(
      :legacy_name, '--legacy-name',
      'Use dataset names based on chromosome entries instead of assembly')
    store.call(
      :blacklist, '--blacklist PATH',
      'A file with dataset names to blacklist')
    store.call(:dry, '--dry', 'Do not download or save the datasets')
    store.call(
      :get_md, '--get-metadata',
      'Only download and update metadata for existing datasets')
    store.call(
      :only_md, '--only-metadata',
      'Create datasets without input data but retrieve all metadata')
    store.call(
      :save_every, '--save-every INT', Integer,
      'Save project every this many downloaded datasets',
      'If zero, it saves the project only once upon completion',
      "By default: #{cli[:save_every]}")
    store.call(
      :query, '-q', '--query',
      'Register the datasets as queries, not reference datasets')
    store.call(
      :unlink, '-u', '--unlink',
      'Unlink all datasets in the project missing from the download list')
    store.call(
      :remote_list, '-R', '--remote-list PATH',
      'Path to an output file with the list of all datasets listed remotely')
    opt.on('--api-key STRING', 'NCBI API key') do |key|
      ENV['NCBI_API_KEY'] = key
    end
  end
end
|
94
|
+
|
95
|
+
##
# Download (or, with --dry, only list) the NCBI genome assemblies matching
# the requested taxon and assembly levels, registering them as datasets of
# the project.
#
# Steps: validate the parameters, query the remote genome list, discard the
# blacklisted names, create or update each dataset, save the project, and
# optionally write the remote list (-R) and unlink the datasets missing
# from it (-u).
#
# Raises a RuntimeError if no type of genome was requested.
def perform
  cli.ensure_par(taxon: '-T') unless cli[:reference]
  tasks = %w[reference complete chromosome scaffold contig]
  unless tasks.any? { |i| cli[i.to_sym] }
    raise 'No action requested: pick at least one type of genome'
  end
  cli[:save_every] = 1 if cli[:dry]

  p = cli.load_project
  d = []
  downloaded = 0

  cli.say 'Downloading genome list'
  ds = parse_remote_list(MiGA::RemoteDataset.download_url(remote_list_url))

  # Discard blacklisted
  unless cli[:blacklist].nil?
    cli.say "Discarding datasets in #{cli[:blacklist]}"
    File.readlines(cli[:blacklist]).
      select { |i| i !~ /^#/ }.map(&:chomp).each { |i| ds.delete i }
  end

  # Download entries
  cli.say "Downloading #{ds.size} " + (ds.size == 1 ? 'entry' : 'entries')
  p.do_not_save = true if cli[:save_every] != 1
  ds.each do |name, body|
    d << name
    cli.puts name
    # Without --get-metadata only new datasets are processed; with it, only
    # datasets already in the project are
    next if p.dataset(name).nil? == cli[:get_md]
    downloaded += 1
    next if cli[:dry]
    cli.say ' Locating remote dataset'
    body[:md][:metadata_only] = true if cli[:only_md]
    rd = MiGA::RemoteDataset.new(body[:ids], body[:db], body[:universe])
    if cli[:get_md]
      cli.say ' Updating dataset'
      rd.update_metadata(p.dataset(name), body[:md])
    else
      cli.say ' Creating dataset'
      rd.save_to(p, name, !cli[:query], body[:md])
      p.add_dataset(name)
    end
    p.save! if cli[:save_every] > 1 and (downloaded % cli[:save_every]) == 0
  end

  p.do_not_save = false
  p.save! if cli[:save_every] != 1

  # Finalize
  cli.say "Datasets listed: #{d.size}"
  cli.say "Datasets #{cli[:dry] ? 'to download' : 'downloaded'}: " +
    downloaded.to_s
  unless cli[:remote_list].nil?
    File.open(cli[:remote_list], 'w') do |fh|
      d.each { |i| fh.puts i }
    end
  end
  if cli[:unlink]
    unlink = p.dataset_names - d
    unlink.each { |i| p.unlink_dataset(i).remove! }
    cli.say "Datasets unlinked: #{unlink.size}"
  end
end

##
# Build the URL of the remote genome list for the current options.
#
# The taxon condition is only included when a taxon was given, since -T is
# optional with --reference.
# NOTE(review): the query without a taxon condition has not been checked
# against the remote service; confirm that it is accepted.
def remote_list_url
  conditions = ['tab==["Prokaryotes"]']
  taxon = cli[:taxon].to_s
  conditions << 'q=="' + taxon.tr('"', "'") + '"' unless taxon.empty?
  if cli[:reference]
    conditions << 'refseq_category==["representative"]'
  else
    status = {
      complete: 'Complete',
      chromosome: ' Chromosome', # <- The leading space is *VERY* important!
      scaffold: 'Scaffold',
      contig: 'Contig'
    }.map { |k, v| '"' + v + '"' if cli[k] }.compact.join(',')
    conditions << 'level==[' + status + ']'
  end
  url_param = {
    q: '[display()].' +
      'from(GenomeAssemblies).' +
      'usingschema(/schema/GenomeAssemblies).' +
      'matching(' + conditions.join(' and ') + ')',
    fields: 'organism|organism,assembly|assembly,replicons|replicons,' +
      'level|level,ftp_path_genbank|ftp_path_genbank,release_date|release_date,' +
      'strain|strain',
    nolimit: 'on',
  }
  'https://www.ncbi.nlm.nih.gov/genomes/solr2txt.cgi?' +
    URI.encode_www_form(url_param)
end

##
# Parse the remote genome list +doc+ (delimited text with headers).
#
# Returns a Hash mapping each dataset name to a Hash with the keys +:ids+,
# +:db+, +:universe+ and +:md+ (metadata). Rows without an assembly or
# without a GenBank FTP path are ignored.
def parse_remote_list(doc)
  ds = {}
  CSV.parse(doc, headers: true).each do |r|
    asm = r['assembly']
    next if asm.nil? or asm.empty? or asm == '-'
    next unless r['ftp_path_genbank']

    # Get replicons: keep the text between the last ':' and the first '/'
    rep = r['replicons'].nil? ? nil : r['replicons'].
      split('; ').map { |i| i.gsub(/.*:/, '').gsub(/\/.*/, '') }

    # Set name
    if cli[:legacy_name] and cli[:reference]
      n = r['#organism'].miga_name
    else
      if cli[:legacy_name] and ['Complete', ' Chromosome'].include? r['level']
        # +to_s+ covers an empty list of replicons
        acc = rep.nil? ? '' : rep.first.to_s
      else
        acc = asm
      end
      # Non-destructive: the version must only be dropped from the dataset
      # name, not from the assembly accession saved in the metadata
      acc = acc.sub(/\.\d+\Z/, '') unless cli[:add_version]
      n = "#{r['#organism']}_#{acc}".miga_name
    end

    # Register for download
    fna_url = r['ftp_path_genbank'] + '/' +
      File.basename(r['ftp_path_genbank']) + '_genomic.fna.gz'
    ds[n] = {
      ids: [fna_url], db: :assembly_gz, universe: :web,
      md: {
        type: :genome, ncbi_asm: asm, strain: r['strain']
      }
    }
    ds[n][:md][:ncbi_nuccore] = rep.join(',') unless rep.nil?
    ds[n][:md][:release_date] =
      Time.parse(r['release_date']).to_s unless r['release_date'].nil?
  end
  ds
end

private :remote_list_url, :parse_remote_list
|
218
|
+
end
|