miga-base 0.4.3.0 → 0.5.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/miga/cli.rb +43 -223
- data/lib/miga/cli/action/add.rb +91 -62
- data/lib/miga/cli/action/classify_wf.rb +97 -0
- data/lib/miga/cli/action/daemon.rb +14 -10
- data/lib/miga/cli/action/derep_wf.rb +95 -0
- data/lib/miga/cli/action/doctor.rb +83 -55
- data/lib/miga/cli/action/get.rb +68 -52
- data/lib/miga/cli/action/get_db.rb +206 -0
- data/lib/miga/cli/action/index_wf.rb +31 -0
- data/lib/miga/cli/action/init.rb +115 -190
- data/lib/miga/cli/action/init/daemon_helper.rb +124 -0
- data/lib/miga/cli/action/ls.rb +20 -11
- data/lib/miga/cli/action/ncbi_get.rb +199 -157
- data/lib/miga/cli/action/preproc_wf.rb +46 -0
- data/lib/miga/cli/action/quality_wf.rb +45 -0
- data/lib/miga/cli/action/stats.rb +147 -99
- data/lib/miga/cli/action/summary.rb +10 -4
- data/lib/miga/cli/action/tax_dist.rb +61 -46
- data/lib/miga/cli/action/tax_test.rb +46 -39
- data/lib/miga/cli/action/wf.rb +178 -0
- data/lib/miga/cli/base.rb +11 -0
- data/lib/miga/cli/objects_helper.rb +88 -0
- data/lib/miga/cli/opt_helper.rb +160 -0
- data/lib/miga/daemon.rb +7 -4
- data/lib/miga/dataset/base.rb +5 -5
- data/lib/miga/project/base.rb +4 -4
- data/lib/miga/project/result.rb +2 -1
- data/lib/miga/remote_dataset/base.rb +5 -5
- data/lib/miga/remote_dataset/download.rb +1 -1
- data/lib/miga/version.rb +3 -3
- data/scripts/cds.bash +3 -1
- data/scripts/essential_genes.bash +1 -0
- data/scripts/stats.bash +1 -1
- data/scripts/trimmed_fasta.bash +5 -3
- data/utils/distance/runner.rb +3 -0
- data/utils/distance/temporal.rb +10 -1
- data/utils/enveomics/Manifest/Tasks/fasta.json +5 -0
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +7 -0
- data/utils/enveomics/Scripts/BlastTab.addlen.rb +33 -31
- data/utils/enveomics/Scripts/FastA.tag.rb +42 -41
- data/utils/enveomics/Scripts/HMM.essential.rb +85 -55
- data/utils/enveomics/Scripts/HMM.haai.rb +29 -20
- data/utils/enveomics/Scripts/SRA.download.bash +1 -1
- data/utils/enveomics/Scripts/aai.rb +163 -128
- data/utils/enveomics/build_enveomics_r.bash +11 -10
- data/utils/enveomics/enveomics.R/DESCRIPTION +3 -2
- data/utils/enveomics/enveomics.R/R/autoprune.R +141 -107
- data/utils/enveomics/enveomics.R/R/barplot.R +105 -86
- data/utils/enveomics/enveomics.R/R/cliopts.R +131 -115
- data/utils/enveomics/enveomics.R/R/df2dist.R +144 -106
- data/utils/enveomics/enveomics.R/R/growthcurve.R +201 -133
- data/utils/enveomics/enveomics.R/R/recplot.R +350 -315
- data/utils/enveomics/enveomics.R/R/recplot2.R +1334 -914
- data/utils/enveomics/enveomics.R/R/tribs.R +521 -361
- data/utils/enveomics/enveomics.R/R/utils.R +31 -15
- data/utils/enveomics/enveomics.R/README.md +7 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +16 -21
- data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +31 -28
- data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -19
- data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +36 -26
- data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -24
- data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -24
- data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +32 -33
- data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +91 -64
- data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +57 -37
- data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -19
- data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +39 -26
- data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +38 -25
- data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +40 -26
- data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +67 -49
- data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +37 -28
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +122 -97
- data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +35 -31
- data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -23
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +68 -51
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -24
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -22
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -20
- data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +41 -32
- data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -24
- data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -18
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +40 -34
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -24
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -20
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -20
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -29
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +41 -42
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -18
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +43 -33
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +36 -28
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +74 -56
- data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +44 -31
- data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -22
- data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +32 -26
- data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +59 -44
- data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -21
- data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -22
- data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +63 -43
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +38 -29
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +38 -30
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +111 -83
- data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -18
- data/utils/find-medoid.R +3 -2
- data/utils/representatives.rb +5 -3
- data/utils/subclade/pipeline.rb +22 -11
- data/utils/subclade/runner.rb +5 -1
- data/utils/subclades-compile.rb +1 -1
- data/utils/subclades.R +9 -3
- metadata +15 -4
- data/utils/enveomics/enveomics.R/man/enveomics.R-package.Rd +0 -15
- data/utils/enveomics/enveomics.R/man/z$-methods.Rd +0 -26
@@ -0,0 +1,124 @@
|
|
1
|
+
# @package MiGA
|
2
|
+
# @license Artistic-2.0
|
3
|
+
|
4
|
+
##
|
5
|
+
# Helper module with daemon configuration functions for MiGA::Cli::Action::Init
|
6
|
+
module MiGA::Cli::Action::Init::DaemonHelper
|
7
|
+
##
# Interactively creates the default daemon template, a JSON file named
# +.miga_daemon.json+ resolved against +ENV['HOME']+.
#
# If the file already exists, the user is asked whether to preserve it
# (default answer: 'yes'); when preserved, nothing is written. Otherwise the
# daemon type is requested (default taken from +cli[:dtype]+) and the
# type-specific helper fills in the remaining settings before the file is
# (over)written.
def configure_daemon
  cli.puts 'Default daemon configuration:'
  template_path = File.expand_path('.miga_daemon.json', ENV['HOME'])
  keep_existing =
    File.exist?(template_path) &&
    cli.ask_user(
      'A template daemon already exists, do you want to preserve it?',
      'yes', %w(yes no)
    ) == 'yes'
  unless keep_existing
    settings = { created: Time.now.to_s, updated: Time.now.to_s }
    settings[:type] = cli.ask_user(
      'Please select the type of daemon you want to setup',
      cli[:dtype], %w(bash qsub msub slurm)
    )
    settings =
      case settings[:type]
      when 'bash' then configure_bash_daemon(settings)
      when 'slurm' then configure_slurm_daemon(settings)
      else configure_qsub_msub_daemon(settings) # qsub or msub
      end
    File.open(template_path, 'w') do |fh|
      fh.puts JSON.pretty_generate(settings)
    end
  end
  cli.puts ''
end
|
29
|
+
|
30
|
+
##
# Interactively collects the settings for a bash-based daemon and stores them
# in the Hash +v+, which is also returned.
#
# Sets +:latency+, +:maxjobs+ and +:ppn+ (converted with +to_i+), followed by
# the command templates +:cmd+, +:var+, +:varsep+, +:alive+ and +:kill+. The
# second argument of every +cli.ask_user+ call is the suggested default.
def configure_bash_daemon(v)
  # Numeric settings, asked in this order
  numeric_prompts = [
    [:latency, 'How long should I sleep? (in secs)', '2'],
    [:maxjobs, 'How many jobs can I launch at once?', '6'],
    [:ppn, 'How many CPUs can I use per job?', '2']
  ]
  numeric_prompts.each do |key, question, default|
    v[key] = cli.ask_user(question, default).to_i
  end

  cli.puts 'Setting up internal daemon defaults.'
  cli.puts "If you don't understand this just leave default values:"

  launch_help =
    "How should I launch tasks?\n %1$s: script path, " \
    "%2$s: variables, %3$d: CPUs, %4$s: log file, %5$s: task name.\n"
  v[:cmd] = cli.ask_user(launch_help, "%2$s '%1$s' > '%4$s' 2>&1")

  vars_help = "How should I pass variables?\n %1$s: keys, %2$s: values.\n"
  v[:var] = cli.ask_user(vars_help, '%1$s=%2$s')

  v[:varsep] = cli.ask_user('What should I use to separate variables?', ' ')

  alive_help =
    "How can I know that a process is still alive?\n %1$s: PID, " \
    "output should be 1 for running and 0 for non-running.\n"
  v[:alive] = cli.ask_user(alive_help, "ps -p '%1$s'|tail -n+2|wc -l")

  kill_help = "How should I terminate tasks?\n %s: process ID."
  v[:kill] = cli.ask_user(kill_help, "kill -9 '%s'")
  v
end
|
52
|
+
|
53
|
+
##
# Interactively collects the settings for a Slurm-based daemon and stores them
# in the Hash +v+, which is also returned.
#
# The queue (partition) name is requested first and embedded in the default
# +sbatch+ launch command. Sets +:latency+, +:maxjobs+ and +:ppn+ (converted
# with +to_i+), followed by the command templates +:cmd+, +:var+, +:varsep+,
# +:alive+ and +:kill+. The second argument of every +cli.ask_user+ call is
# the suggested default.
def configure_slurm_daemon(v)
  # The trailing `true` presumably forces a non-empty answer (no default is
  # offered) -- TODO confirm against cli.ask_user
  queue = cli.ask_user('What queue should I use?', nil, nil, true)
  v[:latency] = cli.ask_user('How long should I sleep? (in secs)', '150').to_i
  v[:maxjobs] = cli.ask_user('How many jobs can I launch at once?', '300').to_i
  v[:ppn] = cli.ask_user('How many CPUs can I use per job?', '2').to_i
  cli.puts 'Setting up internal daemon defaults'
  cli.puts 'If you don\'t understand this just leave default values:'
  # The log file placeholder is a string (%4$s), as used in the default
  # command below; the help text previously advertised it as %4$d
  v[:cmd] = cli.ask_user(
    "How should I launch tasks?\n %1$s: script path, " \
    "%2$s: variables, %3$d: CPUs, %4$s: log file, %5$s: task name.\n",
    "%2$s sbatch --partition='#{queue}' --export=ALL " \
    "--nodes=1 --ntasks-per-node=%3$d --output='%4$s' " \
    "--job-name='%5$s' --mem=9G --time=12:00:00 %1$s " \
    "| perl -pe 's/.* //'"
  )
  v[:var] = cli.ask_user(
    "How should I pass variables?\n %1$s: keys, %2$s: values.\n",
    "%1$s=%2$s"
  )
  v[:varsep] = cli.ask_user(
    'What should I use to separate variables?', ' '
  )
  # NOTE(review): in the grep pattern below, `^` only anchors the PD
  # alternative and `$` only the CG alternative, so any state code containing
  # R or CF also counts as alive -- confirm whether that is intended
  v[:alive] = cli.ask_user(
    "How can I know that a process is still alive?\n %1$s: job id, " \
    "output should be 1 for running and 0 for non-running.\n",
    "squeue -h -o %%t -j '%1$s' | grep '^PD\\|R\\|CF\\|CG$' " \
    "| tail -n 1 | wc -l"
  )
  v[:kill] = cli.ask_user(
    "How should I terminate tasks?\n %s: process ID.", "scancel '%s'"
  )
  v
end
|
81
|
+
|
82
|
+
##
# Interactively collects the settings for a qsub- or msub-based daemon and
# stores them in the Hash +v+, which is also returned.
#
# +v+ must already contain +:type+ ('qsub' or anything else, treated as msub),
# because it selects both the launcher executable in the default +:cmd+ and
# the default +:alive+ / +:kill+ commands. Sets +:latency+, +:maxjobs+ and
# +:ppn+ (converted with +to_i+), followed by the command templates +:cmd+,
# +:var+, +:varsep+, +:alive+ and +:kill+.
#
# The method takes +v+ as a parameter, like its bash and Slurm siblings and
# as expected by +configure_daemon+; it was previously declared without
# parameters and therefore could not be called.
def configure_qsub_msub_daemon(v)
  # The trailing `true` presumably forces a non-empty answer (no default is
  # offered) -- TODO confirm against cli.ask_user
  queue = cli.ask_user('What queue should I use?', nil, nil, true)
  v[:latency] = cli.ask_user('How long should I sleep? (in secs)', '150').to_i
  v[:maxjobs] = cli.ask_user('How many jobs can I launch at once?', '300').to_i
  v[:ppn] = cli.ask_user('How many CPUs can I use per job?', '2').to_i
  cli.puts 'Setting up internal daemon defaults.'
  cli.puts 'If you don\'t understand this just leave default values:'
  # The log file placeholder is a string (%4$s), as used in the default
  # command below; the help text previously advertised it as %4$d
  v[:cmd] = cli.ask_user(
    "How should I launch tasks?\n %1$s: script path, " \
    "%2$s: variables, %3$d: CPUs, %4$s: log file, %5$s: task name.\n",
    "#{v[:type]} -q '#{queue}' -v '%2$s' -l nodes=1:ppn=%3$d %1$s " \
    "-j oe -o '%4$s' -N '%5$s' -l mem=9g -l walltime=12:00:00 " \
    "| grep ."
  )
  v[:var] = cli.ask_user(
    "How should I pass variables?\n %1$s: keys, %2$s: values.\n",
    "%1$s=%2$s"
  )
  v[:varsep] = cli.ask_user(
    'What should I use to separate variables?', ','
  )
  if v[:type] == 'qsub'
    v[:alive] = cli.ask_user(
      "How can I know that a process is still alive?\n " \
      "%1$s: job id, output should be 1 for running and " \
      "0 for non-running.\n",
      "qstat -f '%1$s'|grep ' job_state ='|perl -pe 's/.*= //'" \
      "|grep '[^C]'|tail -n1|wc -l|awk '{print $1}'"
    )
    v[:kill] = cli.ask_user(
      "How should I terminate tasks?\n %s: process ID.", "qdel '%s'"
    )
  else # msub
    v[:alive] = cli.ask_user(
      "How can I know that a process is still alive?\n " \
      "%1$s: job id, output should be 1 for running and " \
      "0 for non-running.\n",
      "checkjob '%1$s'|grep '^State:'|perl -pe 's/.*: //'" \
      "|grep 'Deferred\\|Hold\\|Idle\\|Starting\\|Running\\|Blocked'" \
      "|tail -n1|wc -l|awk '{print $1}'"
    )
    v[:kill] = cli.ask_user(
      "How should I terminate tasks?\n %s: process ID.",
      "canceljob '%s'"
    )
  end
  v
end
|
123
|
+
|
124
|
+
end
|
data/lib/miga/cli/action/ls.rb
CHANGED
@@ -6,50 +6,59 @@ require 'miga/cli/action'
|
|
6
6
|
class MiGA::Cli::Action::Ls < MiGA::Cli::Action
|
7
7
|
|
8
8
|
##
# Declares the command-line interface of the action: sets the defaults for
# +:info+, +:processing+ and +:silent+ and registers the project/dataset
# selectors, the dataset filters, and the output-related options
# (-i, -p, -m, --tab, -o, -s). Each option stores its value in +cli+.
def parse_cli
  cli.defaults = { info: false, processing: false, silent: false }
  cli.parse do |opt|
    cli.opt_object(opt, [:project, :dataset_opt])
    cli.opt_filter_datasets(opt)

    opt.on('-i', '--info',
           'Print additional information on each dataset') do |flag|
      cli[:info] = flag
    end

    opt.on('-p', '--processing',
           'Print information on processing advance') do |flag|
      cli[:processing] = flag
    end

    opt.on('-m', '--metadata STRING',
           'Print name and metadata field only',
           'If set, ignores -i and assumes --tab') do |field|
      cli[:datum] = field
    end

    opt.on('--tab', 'Return a tab-delimited table') do |flag|
      cli[:tabular] = flag
    end

    opt.on('-o', '--output PATH',
           'Create output file instead of returning to STDOUT') do |path|
      cli[:output] = path
    end

    opt.on('-s', '--silent',
           'No output and exit with non-zero status if the dataset list ' \
           'is empty') do |flag|
      cli[:silent] = flag
    end
  end
end
|
36
40
|
|
37
41
|
##
# Lists the datasets selected by +cli.load_and_filter_datasets+.
#
# With +cli[:silent]+ nothing is printed and the process exits with status 1
# if the list is empty, 0 otherwise. Otherwise the listing goes to the file
# in +cli[:output]+ if set, or to +$stdout+, in one of four formats:
# name and one metadata field (+cli[:datum]+), the dataset info table
# (+cli[:info]+), the preprocessing advance table (+cli[:processing]+), or
# plain dataset names.
#
# The output file is closed even if writing raises.
def perform
  ds = cli.load_and_filter_datasets(cli[:silent])
  exit(ds.empty? ? 1 : 0) if cli[:silent]
  io = cli[:output].nil? ? $stdout : File.open(cli[:output], 'w')
  begin
    if !cli[:datum].nil?
      ds.each do |d|
        v = d.metadata[cli[:datum]]
        # Missing metadata fields are reported as '?'
        cli.puts(io, "#{d.name}\t#{v.nil? ? '?' : v}")
      end
    elsif cli[:info]
      cli.table(Dataset.INFO_FIELDS, ds.map { |d| d.info }, io)
    elsif cli[:processing]
      # profile_advance values are used as indexes into these labels
      comp = %w[- done queued]
      cli.table(
        [:name] + MiGA::Dataset.PREPROCESSING_TASKS,
        ds.map { |d| [d.name] + d.profile_advance.map { |i| comp[i] } },
        io
      )
    else
      ds.each { |d| cli.puts(io, d.name) }
    end
  ensure
    # Only close handles opened here; never close $stdout
    io.close unless cli[:output].nil?
  end
end
|
55
64
|
end
|
@@ -6,118 +6,202 @@ require 'miga/remote_dataset'
|
|
6
6
|
require 'csv'
|
7
7
|
|
8
8
|
class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action
|
9
|
-
|
10
9
|
##
# Declares the command-line interface of the action: sets the option defaults
# and registers the project selector, -T/--taxon, -m/--metadata, the option
# groups defined by the +cli_*+ helpers of this class, and --api-key (which
# is stored in the +NCBI_API_KEY+ environment variable instead of +cli+).
def parse_cli
  cli.defaults = {
    query: false,
    unlink: false,
    reference: false,
    legacy_name: false,
    complete: false,
    chromosome: false,
    scaffold: false,
    contig: false,
    add_version: true,
    dry: false,
    get_md: false,
    only_md: false,
    save_every: 1
  }
  cli.parse do |opt|
    cli.opt_object(opt, [:project])

    opt.on('-T', '--taxon STRING',
           '(Mandatory unless --reference) Taxon name ' \
           '(e.g., a species binomial)') do |taxon|
      cli[:taxon] = taxon
    end

    opt.on('-m', '--metadata STRING',
           'Metadata as key-value pairs separated by = and ' \
           'delimited by comma',
           'Values are saved as strings except for booleans ' \
           '(true / false) or nil') do |metadata|
      cli[:metadata] = metadata
    end

    # Option groups, registered in this order
    cli_task_flags(opt)
    cli_name_modifiers(opt)
    cli_filters(opt)
    cli_save_actions(opt)

    opt.on('--api-key STRING', 'NCBI API key') do |key|
      ENV['NCBI_API_KEY'] = key
    end
  end
end
|
98
38
|
|
99
39
|
##
# Runs the action: validates the options, loads the project, retrieves the
# remote dataset list, drops blacklisted entries and downloads the rest.
#
# Afterwards it reports the number of datasets listed and downloaded (or
# "to download" with +cli[:dry]+), writes the listed names to the file in
# +cli[:remote_list]+ if set, and, with +cli[:unlink]+, unlinks and removes
# every project dataset that is not in the listed names.
def perform
  sanitize_cli
  project = cli.load_project
  candidates = discard_blacklisted(remote_list)
  listed, downloaded = download_entries(candidates, project)

  # Summary
  cli.say "Datasets listed: #{listed.size}"
  verb = cli[:dry] ? 'to download' : 'downloaded'
  cli.say "Datasets #{verb}: #{downloaded}"

  list_path = cli[:remote_list]
  unless list_path.nil?
    File.open(list_path, 'w') do |fh|
      listed.each { |name| fh.puts name }
    end
  end

  if cli[:unlink]
    stale = project.dataset_names - listed
    stale.each { |name| project.unlink_dataset(name).remove! }
    cli.say "Datasets unlinked: #{stale.size}"
  end
end
|
60
|
+
|
61
|
+
private
|
62
|
+
|
63
|
+
##
# Registers in +opt+ the flags that select which genomes to download:
# --reference, --complete, --chromosome, --scaffold and --contig (through
# +cli.opt_flag+), plus --all, which turns on the last four at once.
def cli_task_flags(opt)
  {
    'reference' => 'Download all reference genomes (ignore any other status)',
    'complete' => 'Download complete genomes',
    'chromosome' => 'Download complete chromosomes',
    'scaffold' => 'Download genomes in scaffolds',
    'contig' => 'Download genomes in contigs'
  }.each do |flag, description|
    cli.opt_flag(opt, flag, description)
  end

  opt.on('--all', 'Download all genomes (in any status)') do
    # --all does not imply --reference
    cli[:complete] = true
    cli[:chromosome] = true
    cli[:scaffold] = true
    cli[:contig] = true
  end
end
|
82
|
+
|
83
|
+
##
# Registers in +opt+ the options that change how datasets are named:
# --no-version-name (stored in +cli[:add_version]+) and --legacy-name
# (registered through +cli.opt_flag+ under +:legacy_name+).
def cli_name_modifiers(opt)
  opt.on('--no-version-name',
         'Do not add sequence version to the dataset name',
         'Only affects --complete and --chromosome') do |add_version|
    cli[:add_version] = add_version
  end

  legacy_help =
    'Use dataset names based on chromosome entries instead of assembly'
  cli.opt_flag(opt, 'legacy-name', legacy_help, :legacy_name)
end
|
95
|
+
|
96
|
+
##
# Registers in +opt+ the options that restrict which listed datasets are
# processed: --blacklist, --dry, --ignore-until and --get-metadata (the
# latter stored under +:get_md+).
def cli_filters(opt)
  opt.on('--blacklist PATH',
         'A file with dataset names to blacklist') do |path|
    cli[:blacklist] = path
  end

  cli.opt_flag(opt, 'dry', 'Do not download or save the datasets')

  opt.on('--ignore-until STRING',
         'Ignores all datasets until a name is found ' \
         '(useful for large reruns)') do |name|
    cli[:ignore_until] = name
  end

  get_md_help = 'Only download and update metadata for existing datasets'
  cli.opt_flag(opt, 'get-metadata', get_md_help, :get_md)
end
|
110
|
+
|
111
|
+
##
# Registers in +opt+ the options that control how results are saved:
# --only-metadata (stored under +:only_md+), --save-every, -q/--query,
# -u/--unlink and -R/--remote-list.
def cli_save_actions(opt)
  only_md_help =
    'Create datasets without input data but retrieve all metadata'
  cli.opt_flag(opt, 'only-metadata', only_md_help, :only_md)

  # The help text shows the value of cli[:save_every] at registration time
  opt.on('--save-every INT', Integer,
         'Save project every this many downloaded datasets',
         'If zero, it saves the project only once upon completion',
         "By default: #{cli[:save_every]}") do |interval|
    cli[:save_every] = interval
  end

  opt.on('-q', '--query',
         'Register the datasets as queries, not reference datasets') do |flag|
    cli[:query] = flag
  end

  opt.on('-u', '--unlink',
         'Unlink all datasets in the project missing from the ' \
         'download list') do |flag|
    cli[:unlink] = flag
  end

  opt.on('-R', '--remote-list PATH',
         'Path to an output file with the list of all datasets ' \
         'listed remotely') do |path|
    cli[:remote_list] = path
  end
end
|
135
|
+
|
136
|
+
##
# Validates the parsed options: a taxon (-T) is required unless
# +cli[:reference]+ is set, and at least one genome type must be requested.
# With +cli[:dry]+, +cli[:save_every]+ is reset to 1.
#
# Raises a RuntimeError if no genome type was requested.
def sanitize_cli
  cli.ensure_par(taxon: '-T') unless cli[:reference]
  genome_types = [:reference, :complete, :chromosome, :scaffold, :contig]
  if genome_types.none? { |type| cli[type] }
    raise 'No action requested: pick at least one type of genome'
  end
  cli[:save_every] = 1 if cli[:dry]
end
|
105
144
|
|
106
|
-
|
107
|
-
|
145
|
+
##
# Downloads the genome listing from the URL built by +remote_list_url+,
# parses it as CSV with headers, and returns a Hash with one entry per usable
# row, keyed by dataset name (see +remote_row_name+).
#
# Rows without an assembly (nil, empty or '-') or without
# +ftp_path_genbank+ are skipped. Each value holds the download descriptor:
# +:ids+ (the URL of the gzipped genomic FastA), +:db+, +:universe+, and the
# metadata Hash +:md+ (+:type+, +:ncbi_asm+, +:strain+, plus +:ncbi_nuccore+
# and +:release_date+ when available).
def remote_list
  cli.say 'Downloading genome list'
  listing = RemoteDataset.download_url(remote_list_url)
  CSV.parse(listing, headers: true).each_with_object({}) do |row, found|
    assembly = row['assembly']
    next if assembly.nil? || assembly.empty? || assembly == '-'

    ftp_path = row['ftp_path_genbank']
    next unless ftp_path

    replicons = remote_row_replicons(row)
    name = remote_row_name(row, replicons, assembly)

    # Metadata is built after naming, as in the original statement order
    md = { type: :genome, ncbi_asm: assembly, strain: row['strain'] }
    md[:ncbi_nuccore] = replicons.join(',') unless replicons.nil?
    released = row['release_date']
    md[:release_date] = Time.parse(released).to_s unless released.nil?

    # Register for download
    found[name] = {
      ids: ["#{ftp_path}/#{File.basename(ftp_path)}_genomic.fna.gz"],
      db: :assembly_gz, universe: :web, md: md
    }
  end
end
|
110
173
|
|
174
|
+
##
# Extracts the replicon identifiers from the 'replicons' field of the listing
# row +r+. The field is split on '; ' and, for each entry, everything up to
# the last colon and everything from the first slash onwards is removed.
#
# Returns an Array of Strings, or nil if the row has no 'replicons' field.
def remote_row_replicons(r)
  listing = r['replicons']
  return if listing.nil?

  listing.split('; ').map do |entry|
    without_label = entry.gsub(/.*:/, '')
    without_label.gsub(%r{/.*}, '')
  end
end
|
181
|
+
|
182
|
+
##
# Builds the dataset name for the listing row +r+, given its replicon
# identifiers +rep+ (Array or nil) and its assembly accession +asm+.
#
# - With +cli[:legacy_name]+ and +cli[:reference]+: the organism name alone.
# - With +cli[:legacy_name]+ and a 'Complete' or ' Chromosome' level: the
#   organism name and the first replicon (empty if there are none).
# - Otherwise: the organism name and the assembly accession.
#
# Unless +cli[:add_version]+ is set, a trailing version suffix (e.g., '.1')
# is removed from the accession used in the name. The arguments themselves
# are never modified, so metadata built from +asm+ or +rep+ by the caller
# keeps the version.
#
# Returns the name passed through +String#miga_name+.
def remote_row_name(r, rep, asm)
  return r['#organism'].miga_name if cli[:legacy_name] && cli[:reference]

  # NOTE(review): the leading space in ' Chromosome' is kept from the
  # original code and presumably mirrors the remote 'level' value -- confirm
  # before changing
  acc =
    if cli[:legacy_name] && ['Complete', ' Chromosome'].include?(r['level'])
      # Tolerate both a missing and an empty replicon list
      (rep || []).first.to_s
    else
      asm
    end
  # Non-destructive on purpose: +acc+ is the caller's own String object
  acc = acc.sub(/\.\d+\Z/, '') unless cli[:add_version]
  "#{r['#organism']}_#{acc}".miga_name
end
|
192
|
+
|
193
|
+
def remote_list_url
|
111
194
|
url_base = 'https://www.ncbi.nlm.nih.gov/genomes/solr2txt.cgi?'
|
112
195
|
url_param = {
|
113
|
-
q: '[display()].'
|
114
|
-
'from(GenomeAssemblies).'
|
115
|
-
'usingschema(/schema/GenomeAssemblies).'
|
116
|
-
'matching(tab==["Prokaryotes"] and q=="'
|
117
|
-
|
118
|
-
|
119
|
-
'
|
120
|
-
|
196
|
+
q: '[display()].' \
|
197
|
+
'from(GenomeAssemblies).' \
|
198
|
+
'usingschema(/schema/GenomeAssemblies).' \
|
199
|
+
'matching(tab==["Prokaryotes"] and q=="' \
|
200
|
+
"#{cli[:taxon].tr('"', "'")}\"",
|
201
|
+
fields: 'organism|organism,assembly|assembly,replicons|replicons,' \
|
202
|
+
'level|level,ftp_path_genbank|ftp_path_genbank,' \
|
203
|
+
'release_date|release_date,strain|strain',
|
204
|
+
nolimit: 'on'
|
121
205
|
}
|
122
206
|
if cli[:reference]
|
123
207
|
url_param[:q] += ' and refseq_category==["representative"]'
|
@@ -131,95 +215,53 @@ class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action
|
|
131
215
|
url_param[:q] += ' and level==[' + status + ']'
|
132
216
|
end
|
133
217
|
url_param[:q] += ')'
|
134
|
-
|
135
|
-
|
136
|
-
lineno = 0
|
137
|
-
doc = RemoteDataset.download_url(url)
|
138
|
-
CSV.parse(doc, headers: true).each do |r|
|
139
|
-
asm = r['assembly']
|
140
|
-
next if asm.nil? or asm.empty? or asm == '-'
|
141
|
-
next unless r['ftp_path_genbank']
|
142
|
-
|
143
|
-
# Get replicons
|
144
|
-
rep = r['replicons'].nil? ? nil : r['replicons'].
|
145
|
-
split('; ').map{ |i| i.gsub(/.*:/,'') }.map{ |i| i.gsub(/\/.*/, '') }
|
146
|
-
|
147
|
-
# Set name
|
148
|
-
if cli[:legacy_name] and cli[:reference]
|
149
|
-
n = r['#organism'].miga_name
|
150
|
-
else
|
151
|
-
if cli[:legacy_name] and ['Complete',' Chromosome'].include? r['level']
|
152
|
-
acc = rep.nil? ? '' : rep.first
|
153
|
-
else
|
154
|
-
acc = asm
|
155
|
-
end
|
156
|
-
acc.gsub!(/\.\d+\Z/, '') unless cli[:add_version]
|
157
|
-
n = "#{r['#organism']}_#{acc}".miga_name
|
158
|
-
end
|
159
|
-
|
160
|
-
# Register for download
|
161
|
-
fna_url = r['ftp_path_genbank'] + '/' +
|
162
|
-
File.basename(r['ftp_path_genbank']) + '_genomic.fna.gz'
|
163
|
-
ds[n] = {
|
164
|
-
ids: [fna_url], db: :assembly_gz, universe: :web,
|
165
|
-
md: {
|
166
|
-
type: :genome, ncbi_asm: asm, strain: r['strain']
|
167
|
-
}
|
168
|
-
}
|
169
|
-
ds[n][:md][:ncbi_nuccore] = rep.join(',') unless rep.nil?
|
170
|
-
ds[n][:md][:release_date] =
|
171
|
-
Time.parse(r['release_date']).to_s unless r['release_date'].nil?
|
172
|
-
end
|
218
|
+
url_base + URI.encode_www_form(url_param)
|
219
|
+
end
|
173
220
|
|
174
|
-
|
221
|
+
##
# Removes from the Hash +ds+ every key listed in the file +cli[:blacklist]+
# (one name per line; lines starting with '#' are ignored). Does nothing if
# no blacklist was given.
#
# Returns +ds+ itself, modified in place.
def discard_blacklisted(ds)
  blacklist = cli[:blacklist]
  return ds if blacklist.nil?

  cli.say "Discarding datasets in #{blacklist}"
  File.readlines(blacklist).each do |line|
    next if line.start_with?('#') # comment line

    ds.delete(line.chomp)
  end
  ds
end
|
180
231
|
|
181
|
-
|
232
|
+
##
# Processes every entry of the Hash +ds+ (name => download descriptor) for
# the project +p+.
#
# Each name is printed and recorded as listed. An entry is then skipped while
# +cli[:ignore_until]+ has not been reached, or when its presence in the
# project does not match the mode: with +cli[:get_md]+ only existing datasets
# are processed, otherwise only missing ones. Remaining entries are counted
# and, unless +cli[:dry]+, saved with +save_entry+.
#
# Unless +cli[:save_every]+ is 1, automatic project saving is suspended while
# iterating: the project is saved every +cli[:save_every]+ processed entries
# (when greater than 1) and once more at the end.
#
# Returns a two-element Array: the listed names and the number of entries
# processed.
def download_entries(ds, p)
  cli.say "Downloading #{ds.size} #{ds.size == 1 ? 'entry' : 'entries'}"
  p.do_not_save = true unless cli[:save_every] == 1
  skipping = !cli[:ignore_until].nil?
  listed = []
  processed = 0
  ds.each_pair do |name, body|
    listed << name
    cli.puts name
    # Stop skipping once the requested name shows up (it is processed too)
    skipping = false if skipping && name == cli[:ignore_until]
    next if skipping
    next if p.dataset(name).nil? == cli[:get_md]

    processed += 1
    next if cli[:dry]

    save_entry(name, body, p)
    if cli[:save_every] > 1 && (processed % cli[:save_every]).zero?
      p.save!
    end
  end
  p.do_not_save = false
  p.save! unless cli[:save_every] == 1
  [listed, processed]
end
|
209
253
|
|
210
|
-
|
211
|
-
cli.say
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
unlink.each { |i| p.unlink_dataset(i).remove! }
|
222
|
-
cli.say "Datasets unlinked: #{unlink.size}"
|
254
|
+
##
# Saves one remote entry into the project +p+ under the dataset name +name+.
# +body+ is the download descriptor with keys +:ids+, +:db+, +:universe+ and
# +:md+ (metadata Hash).
#
# With +cli[:only_md]+ the metadata is flagged with +metadata_only+. With
# +cli[:get_md]+ the metadata of the existing dataset is updated; otherwise
# the dataset is created (as reference unless +cli[:query]+), added to the
# project, complemented with the metadata given in the command line, and
# saved.
def save_entry(name, body, p)
  cli.say ' Locating remote dataset'
  metadata = body[:md]
  metadata[:metadata_only] = true if cli[:only_md]
  remote = RemoteDataset.new(body[:ids], body[:db], body[:universe])

  if cli[:get_md]
    cli.say ' Updating dataset'
    return remote.update_metadata(p.dataset(name), body[:md])
  end

  cli.say ' Creating dataset'
  as_reference = !cli[:query]
  remote.save_to(p, name, as_reference, body[:md])
  dataset = p.add_dataset(name)
  cli.add_metadata(dataset).save
end
|
225
267
|
end
|