miga-base 0.4.3.0 → 0.5.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/miga/cli.rb +43 -223
- data/lib/miga/cli/action/add.rb +91 -62
- data/lib/miga/cli/action/classify_wf.rb +97 -0
- data/lib/miga/cli/action/daemon.rb +14 -10
- data/lib/miga/cli/action/derep_wf.rb +95 -0
- data/lib/miga/cli/action/doctor.rb +83 -55
- data/lib/miga/cli/action/get.rb +68 -52
- data/lib/miga/cli/action/get_db.rb +206 -0
- data/lib/miga/cli/action/index_wf.rb +31 -0
- data/lib/miga/cli/action/init.rb +115 -190
- data/lib/miga/cli/action/init/daemon_helper.rb +124 -0
- data/lib/miga/cli/action/ls.rb +20 -11
- data/lib/miga/cli/action/ncbi_get.rb +199 -157
- data/lib/miga/cli/action/preproc_wf.rb +46 -0
- data/lib/miga/cli/action/quality_wf.rb +45 -0
- data/lib/miga/cli/action/stats.rb +147 -99
- data/lib/miga/cli/action/summary.rb +10 -4
- data/lib/miga/cli/action/tax_dist.rb +61 -46
- data/lib/miga/cli/action/tax_test.rb +46 -39
- data/lib/miga/cli/action/wf.rb +178 -0
- data/lib/miga/cli/base.rb +11 -0
- data/lib/miga/cli/objects_helper.rb +88 -0
- data/lib/miga/cli/opt_helper.rb +160 -0
- data/lib/miga/daemon.rb +7 -4
- data/lib/miga/dataset/base.rb +5 -5
- data/lib/miga/project/base.rb +4 -4
- data/lib/miga/project/result.rb +2 -1
- data/lib/miga/remote_dataset/base.rb +5 -5
- data/lib/miga/remote_dataset/download.rb +1 -1
- data/lib/miga/version.rb +3 -3
- data/scripts/cds.bash +3 -1
- data/scripts/essential_genes.bash +1 -0
- data/scripts/stats.bash +1 -1
- data/scripts/trimmed_fasta.bash +5 -3
- data/utils/distance/runner.rb +3 -0
- data/utils/distance/temporal.rb +10 -1
- data/utils/enveomics/Manifest/Tasks/fasta.json +5 -0
- data/utils/enveomics/Manifest/Tasks/sequence-identity.json +7 -0
- data/utils/enveomics/Scripts/BlastTab.addlen.rb +33 -31
- data/utils/enveomics/Scripts/FastA.tag.rb +42 -41
- data/utils/enveomics/Scripts/HMM.essential.rb +85 -55
- data/utils/enveomics/Scripts/HMM.haai.rb +29 -20
- data/utils/enveomics/Scripts/SRA.download.bash +1 -1
- data/utils/enveomics/Scripts/aai.rb +163 -128
- data/utils/enveomics/build_enveomics_r.bash +11 -10
- data/utils/enveomics/enveomics.R/DESCRIPTION +3 -2
- data/utils/enveomics/enveomics.R/R/autoprune.R +141 -107
- data/utils/enveomics/enveomics.R/R/barplot.R +105 -86
- data/utils/enveomics/enveomics.R/R/cliopts.R +131 -115
- data/utils/enveomics/enveomics.R/R/df2dist.R +144 -106
- data/utils/enveomics/enveomics.R/R/growthcurve.R +201 -133
- data/utils/enveomics/enveomics.R/R/recplot.R +350 -315
- data/utils/enveomics/enveomics.R/R/recplot2.R +1334 -914
- data/utils/enveomics/enveomics.R/R/tribs.R +521 -361
- data/utils/enveomics/enveomics.R/R/utils.R +31 -15
- data/utils/enveomics/enveomics.R/README.md +7 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +17 -0
- data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +16 -21
- data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +31 -28
- data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -19
- data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +36 -26
- data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -24
- data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -24
- data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +32 -33
- data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +91 -64
- data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +57 -37
- data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -19
- data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +39 -26
- data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +38 -25
- data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +40 -26
- data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +67 -49
- data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +37 -28
- data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +122 -97
- data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +35 -31
- data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -23
- data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +68 -51
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -24
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -22
- data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -20
- data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +41 -32
- data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -24
- data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -18
- data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +40 -34
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -24
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -20
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -20
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -29
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +41 -42
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -18
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +43 -33
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +36 -28
- data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +74 -56
- data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +44 -31
- data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -22
- data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +32 -26
- data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +59 -44
- data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -21
- data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -22
- data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +63 -43
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +38 -29
- data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +38 -30
- data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +111 -83
- data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -18
- data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -18
- data/utils/find-medoid.R +3 -2
- data/utils/representatives.rb +5 -3
- data/utils/subclade/pipeline.rb +22 -11
- data/utils/subclade/runner.rb +5 -1
- data/utils/subclades-compile.rb +1 -1
- data/utils/subclades.R +9 -3
- metadata +15 -4
- data/utils/enveomics/enveomics.R/man/enveomics.R-package.Rd +0 -15
- data/utils/enveomics/enveomics.R/man/z$-methods.Rd +0 -26
@@ -0,0 +1,124 @@
|
|
1
|
+
# @package MiGA
|
2
|
+
# @license Artistic-2.0
|
3
|
+
|
4
|
+
##
|
5
|
+
# Helper module with daemon configuration functions for MiGA::Cli::Action::Init
|
6
|
+
module MiGA::Cli::Action::Init::DaemonHelper
|
7
|
+
##
# Interactively builds the default daemon template and writes it as JSON to
# the file +.miga_daemon.json+ resolved against ENV['HOME']. If the file
# already exists and the user answers 'yes' to preserving it, nothing is
# written.
def configure_daemon
  cli.puts 'Default daemon configuration:'
  template_path = File.expand_path('.miga_daemon.json', ENV['HOME'])

  # The question is only asked when a template is already present
  preserve = File.exist?(template_path) &&
             cli.ask_user(
               'A template daemon already exists, do you want to preserve it?',
               'yes', %w(yes no)
             ) == 'yes'

  unless preserve
    settings = { created: Time.now.to_s, updated: Time.now.to_s }
    settings[:type] = cli.ask_user(
      'Please select the type of daemon you want to setup',
      cli[:dtype], %w(bash qsub msub slurm)
    )
    # Each helper receives the partial settings and returns the completed Hash
    settings =
      case settings[:type]
      when 'bash'
        configure_bash_daemon(settings)
      when 'slurm'
        configure_slurm_daemon(settings)
      else # qsub or msub
        configure_qsub_msub_daemon(settings)
      end
    File.open(template_path, 'w') do |fh|
      fh.puts JSON.pretty_generate(settings)
    end
  end

  cli.puts ''
end
|
29
|
+
|
30
|
+
##
# Asks the user for every setting of a 'bash' daemon, storing each answer in
# the Hash +v+. Returns +v+.
def configure_bash_daemon(v)
  # Numeric settings, asked in this order; answers are coerced with #to_i
  [
    [:latency, 'How long should I sleep? (in secs)', '2'],
    [:maxjobs, 'How many jobs can I launch at once?', '6'],
    [:ppn, 'How many CPUs can I use per job?', '2']
  ].each do |key, question, default|
    v[key] = cli.ask_user(question, default).to_i
  end

  cli.puts 'Setting up internal daemon defaults.'
  cli.puts 'If you don\'t understand this just leave default values:'

  launch_question =
    "How should I launch tasks?\n %1$s: script path, " \
    "%2$s: variables, %3$d: CPUs, %4$s: log file, %5$s: task name.\n"
  v[:cmd] = cli.ask_user(launch_question, "%2$s '%1$s' > '%4$s' 2>&1")

  variables_question =
    "How should I pass variables?\n %1$s: keys, %2$s: values.\n"
  v[:var] = cli.ask_user(variables_question, "%1$s=%2$s")

  v[:varsep] = cli.ask_user('What should I use to separate variables?', ' ')

  alive_question =
    "How can I know that a process is still alive?\n %1$s: PID, " \
    "output should be 1 for running and 0 for non-running.\n"
  v[:alive] = cli.ask_user(alive_question, "ps -p '%1$s'|tail -n+2|wc -l")

  kill_question = "How should I terminate tasks?\n %s: process ID."
  v[:kill] = cli.ask_user(kill_question, "kill -9 '%s'")

  v
end
|
52
|
+
|
53
|
+
##
# Asks the user for every setting of a 'slurm' daemon, storing each answer in
# the Hash +v+. The queue name is asked first and interpolated into the
# default launch command (+--partition+). Returns +v+.
def configure_slurm_daemon(v)
  queue = cli.ask_user('What queue should I use?', nil, nil, true)
  v[:latency] = cli.ask_user('How long should I sleep? (in secs)', '150').to_i
  v[:maxjobs] = cli.ask_user('How many jobs can I launch at once?', '300').to_i
  v[:ppn] = cli.ask_user('How many CPUs can I use per job?', '2').to_i
  cli.puts 'Setting up internal daemon defaults'
  cli.puts 'If you don\'t understand this just leave default values:'
  # The log file placeholder is documented as %4$s (a string), matching its
  # use in the default command below and the help text of the bash daemon
  v[:cmd] = cli.ask_user(
    "How should I launch tasks?\n %1$s: script path, " \
    "%2$s: variables, %3$d: CPUs, %4$s: log file, %5$s: task name.\n",
    "%2$s sbatch --partition='#{queue}' --export=ALL " \
    "--nodes=1 --ntasks-per-node=%3$d --output='%4$s' " \
    "--job-name='%5$s' --mem=9G --time=12:00:00 %1$s " \
    "| perl -pe 's/.* //'"
  )
  v[:var] = cli.ask_user(
    "How should I pass variables?\n %1$s: keys, %2$s: values.\n",
    "%1$s=%2$s"
  )
  v[:varsep] = cli.ask_user(
    'What should I use to separate variables?', ' '
  )
  v[:alive] = cli.ask_user(
    "How can I know that a process is still alive?\n %1$s: job id, " \
    "output should be 1 for running and 0 for non-running.\n",
    "squeue -h -o %%t -j '%1$s' | grep '^PD\\|R\\|CF\\|CG$' " \
    "| tail -n 1 | wc -l"
  )
  v[:kill] = cli.ask_user(
    "How should I terminate tasks?\n %s: process ID.", "scancel '%s'"
  )
  v
end
|
81
|
+
|
82
|
+
##
# Asks the user for every setting of a 'qsub' or 'msub' daemon, storing each
# answer in the Hash +v+. +v+ must already carry +:type+ ('qsub' or 'msub'),
# which selects the default launch, alive and kill commands. Returns +v+.
def configure_qsub_msub_daemon(v)
  queue = cli.ask_user('What queue should I use?', nil, nil, true)
  v[:latency] = cli.ask_user('How long should I sleep? (in secs)', '150').to_i
  v[:maxjobs] = cli.ask_user('How many jobs can I launch at once?', '300').to_i
  v[:ppn] = cli.ask_user('How many CPUs can I use per job?', '2').to_i
  cli.puts 'Setting up internal daemon defaults.'
  cli.puts 'If you don\'t understand this just leave default values:'
  # The log file placeholder is documented as %4$s (a string), matching its
  # use in the default command below
  v[:cmd] = cli.ask_user(
    "How should I launch tasks?\n %1$s: script path, " \
    "%2$s: variables, %3$d: CPUs, %4$s: log file, %5$s: task name.\n",
    "#{v[:type]} -q '#{queue}' -v '%2$s' -l nodes=1:ppn=%3$d %1$s " \
    "-j oe -o '%4$s' -N '%5$s' -l mem=9g -l walltime=12:00:00 " \
    "| grep ."
  )
  v[:var] = cli.ask_user(
    "How should I pass variables?\n %1$s: keys, %2$s: values.\n",
    "%1$s=%2$s"
  )
  v[:varsep] = cli.ask_user(
    'What should I use to separate variables?', ','
  )

  # Both schedulers share the questions and differ only in the defaults
  alive_question =
    "How can I know that a process is still alive?\n " \
    "%1$s: job id, output should be 1 for running and " \
    "0 for non-running.\n"
  kill_question = "How should I terminate tasks?\n %s: process ID."
  if v[:type] == 'qsub'
    v[:alive] = cli.ask_user(
      alive_question,
      "qstat -f '%1$s'|grep ' job_state ='|perl -pe 's/.*= //'" \
      "|grep '[^C]'|tail -n1|wc -l|awk '{print $1}'"
    )
    v[:kill] = cli.ask_user(kill_question, "qdel '%s'")
  else # msub
    v[:alive] = cli.ask_user(
      alive_question,
      "checkjob '%1$s'|grep '^State:'|perl -pe 's/.*: //'" \
      "|grep 'Deferred\\|Hold\\|Idle\\|Starting\\|Running\\|Blocked'" \
      "|tail -n1|wc -l|awk '{print $1}'"
    )
    v[:kill] = cli.ask_user(kill_question, "canceljob '%s'")
  end
  v
end
|
123
|
+
|
124
|
+
end
|
data/lib/miga/cli/action/ls.rb
CHANGED
@@ -6,50 +6,59 @@ require 'miga/cli/action'
|
|
6
6
|
class MiGA::Cli::Action::Ls < MiGA::Cli::Action
|
7
7
|
|
8
8
|
def parse_cli
|
9
|
-
cli.defaults = {info: false, processing: false, silent: false}
|
9
|
+
cli.defaults = { info: false, processing: false, silent: false }
|
10
10
|
cli.parse do |opt|
|
11
11
|
cli.opt_object(opt, [:project, :dataset_opt])
|
12
12
|
cli.opt_filter_datasets(opt)
|
13
13
|
opt.on(
|
14
14
|
'-i', '--info',
|
15
15
|
'Print additional information on each dataset'
|
16
|
-
|
16
|
+
) { |v| cli[:info] = v }
|
17
17
|
opt.on(
|
18
18
|
'-p', '--processing',
|
19
19
|
'Print information on processing advance'
|
20
|
-
|
20
|
+
) { |v| cli[:processing] = v }
|
21
21
|
opt.on(
|
22
22
|
'-m', '--metadata STRING',
|
23
23
|
'Print name and metadata field only',
|
24
24
|
'If set, ignores -i and assumes --tab'
|
25
|
-
|
25
|
+
) { |v| cli[:datum] = v }
|
26
26
|
opt.on(
|
27
27
|
'--tab',
|
28
28
|
'Return a tab-delimited table'
|
29
|
-
|
29
|
+
) { |v| cli[:tabular] = v }
|
30
|
+
opt.on(
|
31
|
+
'-o', '--output PATH',
|
32
|
+
'Create output file instead of returning to STDOUT'
|
33
|
+
) { |v| cli[:output] = v }
|
30
34
|
opt.on(
|
31
35
|
'-s', '--silent',
|
32
36
|
'No output and exit with non-zero status if the dataset list is empty'
|
33
|
-
|
37
|
+
) { |v| cli[:silent] = v }
|
34
38
|
end
|
35
39
|
end
|
36
40
|
|
37
41
|
def perform
|
38
42
|
ds = cli.load_and_filter_datasets(cli[:silent])
|
39
43
|
exit(ds.empty? ? 1 : 0) if cli[:silent]
|
44
|
+
io = cli[:output].nil? ? $stdout : File.open(cli[:output], 'w')
|
40
45
|
if !cli[:datum].nil?
|
41
46
|
ds.each do |d|
|
42
47
|
v = d.metadata[cli[:datum]]
|
43
|
-
puts "#{d.name}\t#{v.nil? ? '?' : v}"
|
48
|
+
cli.puts(io, "#{d.name}\t#{v.nil? ? '?' : v}")
|
44
49
|
end
|
45
50
|
elsif cli[:info]
|
46
|
-
cli.table(Dataset.INFO_FIELDS, ds.map { |d| d.info })
|
51
|
+
cli.table(Dataset.INFO_FIELDS, ds.map { |d| d.info }, io)
|
47
52
|
elsif cli[:processing]
|
48
53
|
comp = %w[- done queued]
|
49
|
-
cli.table(
|
50
|
-
|
54
|
+
cli.table(
|
55
|
+
[:name] + MiGA::Dataset.PREPROCESSING_TASKS,
|
56
|
+
ds.map { |d| [d.name] + d.profile_advance.map { |i| comp[i] } },
|
57
|
+
io
|
58
|
+
)
|
51
59
|
else
|
52
|
-
ds.each { |d| cli.puts d.name }
|
60
|
+
ds.each { |d| cli.puts(io, d.name) }
|
53
61
|
end
|
62
|
+
io.close unless cli[:output].nil?
|
54
63
|
end
|
55
64
|
end
|
@@ -6,118 +6,202 @@ require 'miga/remote_dataset'
|
|
6
6
|
require 'csv'
|
7
7
|
|
8
8
|
class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action
|
9
|
-
|
10
9
|
def parse_cli
|
11
|
-
cli.defaults = {
|
10
|
+
cli.defaults = {
|
11
|
+
query: false, unlink: false,
|
12
12
|
reference: false, legacy_name: false,
|
13
13
|
complete: false, chromosome: false,
|
14
14
|
scaffold: false, contig: false, add_version: true, dry: false,
|
15
|
-
get_md: false, only_md: false, save_every: 1
|
15
|
+
get_md: false, only_md: false, save_every: 1
|
16
|
+
}
|
16
17
|
cli.parse do |opt|
|
17
18
|
cli.opt_object(opt, [:project])
|
18
19
|
opt.on(
|
19
20
|
'-T', '--taxon STRING',
|
20
21
|
'(Mandatory unless --reference) Taxon name (e.g., a species binomial)'
|
21
|
-
|
22
|
-
opt.on('--reference',
|
23
|
-
'Download all reference genomes (ignore any other status)'
|
24
|
-
){ |v| cli[:reference] = v }
|
25
|
-
opt.on(
|
26
|
-
'--complete',
|
27
|
-
'Download complete genomes'
|
28
|
-
){ |v| cli[:complete] = v }
|
29
|
-
opt.on('--chromosome',
|
30
|
-
'Download complete chromosomes'
|
31
|
-
){ |v| cli[:chromosome] = v }
|
32
|
-
opt.on(
|
33
|
-
'--scaffold',
|
34
|
-
'Download genomes in scaffolds'
|
35
|
-
){ |v| cli[:scaffold] = v }
|
36
|
-
opt.on(
|
37
|
-
'--contig',
|
38
|
-
'Download genomes in contigs'
|
39
|
-
){ |v| cli[:contig] = v }
|
40
|
-
opt.on(
|
41
|
-
'--all',
|
42
|
-
'Download all genomes (in any status)') do
|
43
|
-
cli[:complete] = true
|
44
|
-
cli[:chromosome] = true
|
45
|
-
cli[:scaffold] = true
|
46
|
-
cli[:contig] = true
|
47
|
-
end
|
48
|
-
opt.on(
|
49
|
-
'--no-version-name',
|
50
|
-
'Do not add sequence version to the dataset name',
|
51
|
-
'Only affects --complete and --chromosome'
|
52
|
-
){ |v| cli[:add_version] = v }
|
53
|
-
opt.on(
|
54
|
-
'--legacy-name',
|
55
|
-
'Use dataset names based on chromosome entries instead of assembly'
|
56
|
-
){ |v| cli[:legacy_name] = v }
|
57
|
-
opt.on('--blacklist PATH',
|
58
|
-
'A file with dataset names to blacklist'
|
59
|
-
){ |v| cli[:blacklist] = v }
|
60
|
-
opt.on(
|
61
|
-
'--dry',
|
62
|
-
'Do not download or save the datasets'
|
63
|
-
){ |v| cli[:dry] = v }
|
64
|
-
opt.on(
|
65
|
-
'--ignore-until STRING',
|
66
|
-
'Ignores all datasets until a name is found (useful for large reruns)'
|
67
|
-
){ |v| cli[:ignore_until] = v }
|
68
|
-
opt.on(
|
69
|
-
'--get-metadata',
|
70
|
-
'Only download and update metadata for existing datasets'
|
71
|
-
){ |v| cli[:get_md] = v }
|
72
|
-
opt.on('--only-metadata',
|
73
|
-
'Create datasets without input data but retrieve all metadata'
|
74
|
-
){ |v| cli[:only_md] = v }
|
75
|
-
opt.on(
|
76
|
-
'--save-every INT', Integer,
|
77
|
-
'Save project every this many downloaded datasets',
|
78
|
-
'If zero, it saves the project only once upon completion',
|
79
|
-
"By default: #{cli[:save_every]}"
|
80
|
-
){ |v| cli[:save_every] = v }
|
22
|
+
) { |v| cli[:taxon] = v }
|
81
23
|
opt.on(
|
82
|
-
'-
|
83
|
-
'
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
opt
|
90
|
-
'Path to an output file with the list of all datasets listed remotely'
|
91
|
-
){ |v| cli[:remote_list] = v }
|
24
|
+
'-m', '--metadata STRING',
|
25
|
+
'Metadata as key-value pairs separated by = and delimited by comma',
|
26
|
+
'Values are saved as strings except for booleans (true / false) or nil'
|
27
|
+
) { |v| cli[:metadata] = v }
|
28
|
+
cli_task_flags(opt)
|
29
|
+
cli_name_modifiers(opt)
|
30
|
+
cli_filters(opt)
|
31
|
+
cli_save_actions(opt)
|
92
32
|
opt.on(
|
93
33
|
'--api-key STRING',
|
94
34
|
'NCBI API key'
|
95
|
-
|
35
|
+
) { |v| ENV['NCBI_API_KEY'] = v }
|
96
36
|
end
|
97
37
|
end
|
98
38
|
|
99
39
|
##
# Runs the action: validates the options, loads the project, obtains the
# remote listing, drops blacklisted entries and downloads the rest. It then
# reports the counts, optionally writes the listed names to the file in
# +cli[:remote_list]+ and, when +cli[:unlink]+ is set, unlinks and removes
# project datasets absent from the listing.
def perform
  sanitize_cli
  project = cli.load_project
  entries = discard_blacklisted(remote_list)
  listed, downloaded = download_entries(entries, project)

  # Report
  cli.say "Datasets listed: #{listed.size}"
  verb = cli[:dry] ? 'to download' : 'downloaded'
  cli.say "Datasets #{verb}: #{downloaded}"

  # Optional dump of every listed name, one per line
  unless cli[:remote_list].nil?
    File.open(cli[:remote_list], 'w') do |fh|
      listed.each { |name| fh.puts name }
    end
  end

  # Optional cleanup of datasets missing from the remote listing
  if cli[:unlink]
    stale = project.dataset_names - listed
    stale.each { |name| project.unlink_dataset(name).remove! }
    cli.say "Datasets unlinked: #{stale.size}"
  end
end
|
60
|
+
|
61
|
+
private
|
62
|
+
|
63
|
+
##
# Registers in +opt+ the flags selecting which genome statuses to download,
# plus +--all+, which turns on complete, chromosome, scaffold and contig.
def cli_task_flags(opt)
  {
    'reference' => 'Download all reference genomes (ignore any other status)',
    'complete' => 'Download complete genomes',
    'chromosome' => 'Download complete chromosomes',
    'scaffold' => 'Download genomes in scaffolds',
    'contig' => 'Download genomes in contigs'
  }.each do |flag, description|
    cli.opt_flag(opt, flag, description)
  end

  opt.on('--all', 'Download all genomes (in any status)') do
    cli[:complete] = true
    cli[:chromosome] = true
    cli[:scaffold] = true
    cli[:contig] = true
  end
end
|
82
|
+
|
83
|
+
##
# Registers in +opt+ the options that alter how dataset names are built:
# +--no-version-name+ (stored in +cli[:add_version]+) and +--legacy-name+.
def cli_name_modifiers(opt)
  version_help = [
    'Do not add sequence version to the dataset name',
    'Only affects --complete and --chromosome'
  ]
  opt.on('--no-version-name', *version_help) do |v|
    cli[:add_version] = v
  end

  legacy_help =
    'Use dataset names based on chromosome entries instead of assembly'
  # NOTE(review): the last argument is presumably the cli key that
  # opt_flag sets; confirm against its definition
  cli.opt_flag(opt, 'legacy-name', legacy_help, :legacy_name)
end
|
95
|
+
|
96
|
+
##
# Registers in +opt+ the options that restrict which listed datasets are
# processed: +--blacklist+, +--dry+, +--ignore-until+ and +--get-metadata+.
def cli_filters(opt)
  opt.on('--blacklist PATH', 'A file with dataset names to blacklist') do |v|
    cli[:blacklist] = v
  end

  cli.opt_flag(opt, 'dry', 'Do not download or save the datasets')

  ignore_help =
    'Ignores all datasets until a name is found (useful for large reruns)'
  opt.on('--ignore-until STRING', ignore_help) do |v|
    cli[:ignore_until] = v
  end

  metadata_help = 'Only download and update metadata for existing datasets'
  cli.opt_flag(opt, 'get-metadata', metadata_help, :get_md)
end
|
110
|
+
|
111
|
+
##
# Registers in +opt+ the options controlling how datasets are saved:
# +--only-metadata+, +--save-every+, +--query+, +--unlink+ and
# +--remote-list+.
def cli_save_actions(opt)
  only_md_help = 'Create datasets without input data but retrieve all metadata'
  cli.opt_flag(opt, 'only-metadata', only_md_help, :only_md)

  # The help text shows the value of cli[:save_every] at registration time
  save_every_help = [
    'Save project every this many downloaded datasets',
    'If zero, it saves the project only once upon completion',
    "By default: #{cli[:save_every]}"
  ]
  opt.on('--save-every INT', Integer, *save_every_help) do |v|
    cli[:save_every] = v
  end

  query_help = 'Register the datasets as queries, not reference datasets'
  opt.on('-q', '--query', query_help) do |v|
    cli[:query] = v
  end

  unlink_help =
    'Unlink all datasets in the project missing from the download list'
  opt.on('-u', '--unlink', unlink_help) do |v|
    cli[:unlink] = v
  end

  remote_list_help =
    'Path to an output file with the list of all datasets listed remotely'
  opt.on('-R', '--remote-list PATH', remote_list_help) do |v|
    cli[:remote_list] = v
  end
end
|
135
|
+
|
136
|
+
def sanitize_cli
|
100
137
|
cli.ensure_par(taxon: '-T') unless cli[:reference]
|
101
|
-
|
138
|
+
tasks = %w[reference complete chromosome scaffold contig]
|
139
|
+
unless tasks.any? { |i| cli[i.to_sym] }
|
102
140
|
raise 'No action requested: pick at least one type of genome'
|
103
141
|
end
|
104
142
|
cli[:save_every] = 1 if cli[:dry]
|
143
|
+
end
|
105
144
|
|
106
|
-
|
107
|
-
|
145
|
+
def remote_list
|
146
|
+
cli.say 'Downloading genome list'
|
108
147
|
ds = {}
|
109
|
-
|
148
|
+
url = remote_list_url
|
149
|
+
doc = RemoteDataset.download_url(url)
|
150
|
+
CSV.parse(doc, headers: true).each do |r|
|
151
|
+
asm = r['assembly']
|
152
|
+
next if asm.nil? || asm.empty? || asm == '-'
|
153
|
+
next unless r['ftp_path_genbank']
|
154
|
+
rep = remote_row_replicons(r)
|
155
|
+
n = remote_row_name(r, rep, asm)
|
156
|
+
|
157
|
+
# Register for download
|
158
|
+
fna_url = '%s/%s_genomic.fna.gz' %
|
159
|
+
[r['ftp_path_genbank'], File.basename(r['ftp_path_genbank'])]
|
160
|
+
ds[n] = {
|
161
|
+
ids: [fna_url], db: :assembly_gz, universe: :web,
|
162
|
+
md: {
|
163
|
+
type: :genome, ncbi_asm: asm, strain: r['strain']
|
164
|
+
}
|
165
|
+
}
|
166
|
+
ds[n][:md][:ncbi_nuccore] = rep.join(',') unless rep.nil?
|
167
|
+
unless r['release_date'].nil?
|
168
|
+
ds[n][:md][:release_date] = Time.parse(r['release_date']).to_s
|
169
|
+
end
|
170
|
+
end
|
171
|
+
ds
|
172
|
+
end
|
110
173
|
|
174
|
+
##
# Extracts the replicon identifiers from the 'replicons' field of row +r+.
# The field is split on '; '; from each item everything up to the last ':'
# and everything from the first remaining '/' onwards is dropped. Returns
# an Array of Strings, or nil when the field is nil.
def remote_row_replicons(r)
  listing = r['replicons']
  return if listing.nil?

  listing.split('; ').map do |entry|
    entry.gsub(/.*:/, '').gsub(%r{/.*}, '')
  end
end
|
181
|
+
|
182
|
+
##
# Builds the dataset name for the remote row +r+.
#
# With both +cli[:legacy_name]+ and +cli[:reference]+ the name is just the
# organism. Otherwise the organism is joined by '_' to an accession: the
# first entry of +rep+ (empty when +rep+ is nil or empty) for legacy names of
# rows at a listed level, or +asm+ in any other case. The trailing version
# ('.N') is dropped from the accession unless +cli[:add_version]+ is set.
#
# +asm+ and the entries of +rep+ are never modified, so the caller can keep
# using them with their version intact.
def remote_row_name(r, rep, asm)
  organism = r['#organism']
  return organism.miga_name if cli[:legacy_name] && cli[:reference]

  # NOTE(review): ' Chromosome' carries a leading space; confirm whether the
  # 'level' field really contains it or whether 'Chromosome' was intended
  acc =
    if cli[:legacy_name] && ['Complete', ' Chromosome'].include?(r['level'])
      Array(rep).first.to_s
    else
      asm
    end
  # Non-destructive on purpose: acc is the same object as asm or rep.first
  acc = acc.sub(/\.\d+\Z/, '') unless cli[:add_version]
  "#{organism}_#{acc}".miga_name
end
|
192
|
+
|
193
|
+
def remote_list_url
|
111
194
|
url_base = 'https://www.ncbi.nlm.nih.gov/genomes/solr2txt.cgi?'
|
112
195
|
url_param = {
|
113
|
-
q: '[display()].'
|
114
|
-
'from(GenomeAssemblies).'
|
115
|
-
'usingschema(/schema/GenomeAssemblies).'
|
116
|
-
'matching(tab==["Prokaryotes"] and q=="'
|
117
|
-
|
118
|
-
|
119
|
-
'
|
120
|
-
|
196
|
+
q: '[display()].' \
|
197
|
+
'from(GenomeAssemblies).' \
|
198
|
+
'usingschema(/schema/GenomeAssemblies).' \
|
199
|
+
'matching(tab==["Prokaryotes"] and q=="' \
|
200
|
+
"#{cli[:taxon].tr('"', "'")}\"",
|
201
|
+
fields: 'organism|organism,assembly|assembly,replicons|replicons,' \
|
202
|
+
'level|level,ftp_path_genbank|ftp_path_genbank,' \
|
203
|
+
'release_date|release_date,strain|strain',
|
204
|
+
nolimit: 'on'
|
121
205
|
}
|
122
206
|
if cli[:reference]
|
123
207
|
url_param[:q] += ' and refseq_category==["representative"]'
|
@@ -131,95 +215,53 @@ class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action
|
|
131
215
|
url_param[:q] += ' and level==[' + status + ']'
|
132
216
|
end
|
133
217
|
url_param[:q] += ')'
|
134
|
-
|
135
|
-
|
136
|
-
lineno = 0
|
137
|
-
doc = RemoteDataset.download_url(url)
|
138
|
-
CSV.parse(doc, headers: true).each do |r|
|
139
|
-
asm = r['assembly']
|
140
|
-
next if asm.nil? or asm.empty? or asm == '-'
|
141
|
-
next unless r['ftp_path_genbank']
|
142
|
-
|
143
|
-
# Get replicons
|
144
|
-
rep = r['replicons'].nil? ? nil : r['replicons'].
|
145
|
-
split('; ').map{ |i| i.gsub(/.*:/,'') }.map{ |i| i.gsub(/\/.*/, '') }
|
146
|
-
|
147
|
-
# Set name
|
148
|
-
if cli[:legacy_name] and cli[:reference]
|
149
|
-
n = r['#organism'].miga_name
|
150
|
-
else
|
151
|
-
if cli[:legacy_name] and ['Complete',' Chromosome'].include? r['level']
|
152
|
-
acc = rep.nil? ? '' : rep.first
|
153
|
-
else
|
154
|
-
acc = asm
|
155
|
-
end
|
156
|
-
acc.gsub!(/\.\d+\Z/, '') unless cli[:add_version]
|
157
|
-
n = "#{r['#organism']}_#{acc}".miga_name
|
158
|
-
end
|
159
|
-
|
160
|
-
# Register for download
|
161
|
-
fna_url = r['ftp_path_genbank'] + '/' +
|
162
|
-
File.basename(r['ftp_path_genbank']) + '_genomic.fna.gz'
|
163
|
-
ds[n] = {
|
164
|
-
ids: [fna_url], db: :assembly_gz, universe: :web,
|
165
|
-
md: {
|
166
|
-
type: :genome, ncbi_asm: asm, strain: r['strain']
|
167
|
-
}
|
168
|
-
}
|
169
|
-
ds[n][:md][:ncbi_nuccore] = rep.join(',') unless rep.nil?
|
170
|
-
ds[n][:md][:release_date] =
|
171
|
-
Time.parse(r['release_date']).to_s unless r['release_date'].nil?
|
172
|
-
end
|
218
|
+
url_base + URI.encode_www_form(url_param)
|
219
|
+
end
|
173
220
|
|
174
|
-
|
221
|
+
def discard_blacklisted(ds)
|
175
222
|
unless cli[:blacklist].nil?
|
176
223
|
cli.say "Discarding datasets in #{cli[:blacklist]}"
|
177
|
-
File.readlines(cli[:blacklist])
|
178
|
-
|
224
|
+
File.readlines(cli[:blacklist])
|
225
|
+
.select { |i| i !~ /^#/ }
|
226
|
+
.map(&:chomp)
|
227
|
+
.each { |i| ds.delete i }
|
179
228
|
end
|
229
|
+
ds
|
230
|
+
end
|
180
231
|
|
181
|
-
|
232
|
+
def download_entries(ds, p)
|
182
233
|
cli.say "Downloading #{ds.size} " + (ds.size == 1 ? 'entry' : 'entries')
|
183
234
|
p.do_not_save = true if cli[:save_every] != 1
|
184
235
|
ignore = !cli[:ignore_until].nil?
|
236
|
+
downloaded = 0
|
237
|
+
d = []
|
185
238
|
ds.each do |name, body|
|
186
239
|
d << name
|
187
240
|
cli.puts name
|
188
241
|
ignore = false if ignore && name == cli[:ignore_until]
|
189
|
-
next if ignore
|
190
|
-
next if p.dataset(name).nil? == cli[:get_md]
|
242
|
+
next if ignore || p.dataset(name).nil? == cli[:get_md]
|
191
243
|
downloaded += 1
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
rd = RemoteDataset.new(body[:ids], body[:db], body[:universe])
|
196
|
-
if cli[:get_md]
|
197
|
-
cli.say ' Updating dataset'
|
198
|
-
rd.update_metadata(p.dataset(name), body[:md])
|
199
|
-
else
|
200
|
-
cli.say ' Creating dataset'
|
201
|
-
rd.save_to(p, name, !cli[:query], body[:md])
|
202
|
-
p.add_dataset(name)
|
244
|
+
unless cli[:dry]
|
245
|
+
save_entry(name, body, p)
|
246
|
+
p.save! if cli[:save_every] > 1 && (downloaded % cli[:save_every]).zero?
|
203
247
|
end
|
204
|
-
p.save! if cli[:save_every] > 1 and (downloaded % cli[:save_every]) == 0
|
205
248
|
end
|
206
|
-
|
207
249
|
p.do_not_save = false
|
208
250
|
p.save! if cli[:save_every] != 1
|
251
|
+
[d, downloaded]
|
252
|
+
end
|
209
253
|
|
210
|
-
|
211
|
-
cli.say
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
unlink.each { |i| p.unlink_dataset(i).remove! }
|
222
|
-
cli.say "Datasets unlinked: #{unlink.size}"
|
254
|
+
##
# Processes one remote entry +body+ for the dataset +name+ in project +p+.
# Flags the metadata as metadata-only when +cli[:only_md]+ is set. Then it
# either updates the metadata of the existing dataset (+cli[:get_md]+) or
# saves the remote dataset into the project, adds it, applies the
# command-line metadata and saves the result.
def save_entry(name, body, p)
  cli.say ' Locating remote dataset'
  metadata = body[:md]
  metadata[:metadata_only] = true if cli[:only_md]
  remote = RemoteDataset.new(body[:ids], body[:db], body[:universe])

  unless cli[:get_md]
    cli.say ' Creating dataset'
    # Registered as a reference dataset unless --query was requested
    remote.save_to(p, name, !cli[:query], metadata)
    return cli.add_metadata(p.add_dataset(name)).save
  end

  cli.say ' Updating dataset'
  remote.update_metadata(p.dataset(name), metadata)
end
|
225
267
|
end
|