miga-base 0.4.3.0 → 0.5.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +1 -1
  3. data/lib/miga/cli.rb +43 -223
  4. data/lib/miga/cli/action/add.rb +91 -62
  5. data/lib/miga/cli/action/classify_wf.rb +97 -0
  6. data/lib/miga/cli/action/daemon.rb +14 -10
  7. data/lib/miga/cli/action/derep_wf.rb +95 -0
  8. data/lib/miga/cli/action/doctor.rb +83 -55
  9. data/lib/miga/cli/action/get.rb +68 -52
  10. data/lib/miga/cli/action/get_db.rb +206 -0
  11. data/lib/miga/cli/action/index_wf.rb +31 -0
  12. data/lib/miga/cli/action/init.rb +115 -190
  13. data/lib/miga/cli/action/init/daemon_helper.rb +124 -0
  14. data/lib/miga/cli/action/ls.rb +20 -11
  15. data/lib/miga/cli/action/ncbi_get.rb +199 -157
  16. data/lib/miga/cli/action/preproc_wf.rb +46 -0
  17. data/lib/miga/cli/action/quality_wf.rb +45 -0
  18. data/lib/miga/cli/action/stats.rb +147 -99
  19. data/lib/miga/cli/action/summary.rb +10 -4
  20. data/lib/miga/cli/action/tax_dist.rb +61 -46
  21. data/lib/miga/cli/action/tax_test.rb +46 -39
  22. data/lib/miga/cli/action/wf.rb +178 -0
  23. data/lib/miga/cli/base.rb +11 -0
  24. data/lib/miga/cli/objects_helper.rb +88 -0
  25. data/lib/miga/cli/opt_helper.rb +160 -0
  26. data/lib/miga/daemon.rb +7 -4
  27. data/lib/miga/dataset/base.rb +5 -5
  28. data/lib/miga/project/base.rb +4 -4
  29. data/lib/miga/project/result.rb +2 -1
  30. data/lib/miga/remote_dataset/base.rb +5 -5
  31. data/lib/miga/remote_dataset/download.rb +1 -1
  32. data/lib/miga/version.rb +3 -3
  33. data/scripts/cds.bash +3 -1
  34. data/scripts/essential_genes.bash +1 -0
  35. data/scripts/stats.bash +1 -1
  36. data/scripts/trimmed_fasta.bash +5 -3
  37. data/utils/distance/runner.rb +3 -0
  38. data/utils/distance/temporal.rb +10 -1
  39. data/utils/enveomics/Manifest/Tasks/fasta.json +5 -0
  40. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +7 -0
  41. data/utils/enveomics/Scripts/BlastTab.addlen.rb +33 -31
  42. data/utils/enveomics/Scripts/FastA.tag.rb +42 -41
  43. data/utils/enveomics/Scripts/HMM.essential.rb +85 -55
  44. data/utils/enveomics/Scripts/HMM.haai.rb +29 -20
  45. data/utils/enveomics/Scripts/SRA.download.bash +1 -1
  46. data/utils/enveomics/Scripts/aai.rb +163 -128
  47. data/utils/enveomics/build_enveomics_r.bash +11 -10
  48. data/utils/enveomics/enveomics.R/DESCRIPTION +3 -2
  49. data/utils/enveomics/enveomics.R/R/autoprune.R +141 -107
  50. data/utils/enveomics/enveomics.R/R/barplot.R +105 -86
  51. data/utils/enveomics/enveomics.R/R/cliopts.R +131 -115
  52. data/utils/enveomics/enveomics.R/R/df2dist.R +144 -106
  53. data/utils/enveomics/enveomics.R/R/growthcurve.R +201 -133
  54. data/utils/enveomics/enveomics.R/R/recplot.R +350 -315
  55. data/utils/enveomics/enveomics.R/R/recplot2.R +1334 -914
  56. data/utils/enveomics/enveomics.R/R/tribs.R +521 -361
  57. data/utils/enveomics/enveomics.R/R/utils.R +31 -15
  58. data/utils/enveomics/enveomics.R/README.md +7 -0
  59. data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +17 -0
  60. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +17 -0
  61. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +17 -0
  62. data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +16 -21
  63. data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +31 -28
  64. data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -19
  65. data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +36 -26
  66. data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -24
  67. data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -24
  68. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +32 -33
  69. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +91 -64
  70. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +57 -37
  71. data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -19
  72. data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -18
  73. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +39 -26
  74. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +38 -25
  75. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +40 -26
  76. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +67 -49
  77. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +37 -28
  78. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +122 -97
  79. data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +35 -31
  80. data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -23
  81. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +68 -51
  82. data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -24
  83. data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -22
  84. data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -20
  85. data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -18
  86. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +41 -32
  87. data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -24
  88. data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -18
  89. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +40 -34
  90. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -24
  91. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -20
  92. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -20
  93. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -29
  94. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +41 -42
  95. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -18
  96. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +43 -33
  97. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +36 -28
  98. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +74 -56
  99. data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +44 -31
  100. data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -22
  101. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +32 -26
  102. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +59 -44
  103. data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -21
  104. data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -22
  105. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +63 -43
  106. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +38 -29
  107. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +38 -30
  108. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +111 -83
  109. data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -18
  110. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -18
  111. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -18
  112. data/utils/find-medoid.R +3 -2
  113. data/utils/representatives.rb +5 -3
  114. data/utils/subclade/pipeline.rb +22 -11
  115. data/utils/subclade/runner.rb +5 -1
  116. data/utils/subclades-compile.rb +1 -1
  117. data/utils/subclades.R +9 -3
  118. metadata +15 -4
  119. data/utils/enveomics/enveomics.R/man/enveomics.R-package.Rd +0 -15
  120. data/utils/enveomics/enveomics.R/man/z$-methods.Rd +0 -26
@@ -0,0 +1,124 @@
1
+ # @package MiGA
2
+ # @license Artistic-2.0
3
+
4
+ ##
5
+ # Helper module with daemon configuration functions for MiGA::Cli::Action::Init
6
+ module MiGA::Cli::Action::Init::DaemonHelper
7
##
# Interactively builds the default daemon template stored as
# +.miga_daemon.json+ under the directory given by ENV['HOME'].
#
# When the file already exists the user is asked whether to preserve it; the
# template is only (re)written if it is missing or the answer is not 'yes'.
# The daemon type is requested from the user (default: cli[:dtype]) and the
# matching +configure_*_daemon+ helper fills in the remaining settings before
# the result is saved as pretty-printed JSON.
def configure_daemon
  cli.puts 'Default daemon configuration:'
  daemon_f = File.expand_path('.miga_daemon.json', ENV['HOME'])
  # The question is only asked when there is an existing file to preserve
  keep_existing =
    File.exist?(daemon_f) &&
    cli.ask_user(
      'A template daemon already exists, do you want to preserve it?',
      'yes', %w(yes no)
    ) == 'yes'
  unless keep_existing
    conf = { created: Time.now.to_s, updated: Time.now.to_s }
    conf[:type] = cli.ask_user(
      'Please select the type of daemon you want to setup',
      cli[:dtype], %w(bash qsub msub slurm)
    )
    conf =
      case conf[:type]
      when 'bash' then configure_bash_daemon(conf)
      when 'slurm' then configure_slurm_daemon(conf)
      else configure_qsub_msub_daemon(conf) # qsub or msub
      end
    File.open(daemon_f, 'w') { |fh| fh.puts JSON.pretty_generate(conf) }
  end
  cli.puts ''
end
29
+
30
##
# Asks the user for the settings of a 'bash' daemon and stores them in the
# hash +v+, which is also the return value.
#
# Sets the keys :latency, :maxjobs and :ppn (answers coerced with +to_i+),
# followed by :cmd, :var, :varsep, :alive and :kill (answers stored as
# returned by +cli.ask_user+). Questions are asked in exactly that order.
def configure_bash_daemon(v)
  # Numeric settings: [question, default answer]
  {
    latency: ['How long should I sleep? (in secs)', '2'],
    maxjobs: ['How many jobs can I launch at once?', '6'],
    ppn: ['How many CPUs can I use per job?', '2']
  }.each do |key, (question, default)|
    v[key] = cli.ask_user(question, default).to_i
  end

  cli.puts 'Setting up internal daemon defaults.'
  cli.puts "If you don't understand this just leave default values:"

  # Command templates: [key, question, default answer]
  [
    [
      :cmd,
      "How should I launch tasks?\n %1$s: script path, " \
      "%2$s: variables, %3$d: CPUs, %4$s: log file, %5$s: task name.\n",
      "%2$s '%1$s' > '%4$s' 2>&1"
    ],
    [
      :var,
      "How should I pass variables?\n %1$s: keys, %2$s: values.\n",
      "%1$s=%2$s"
    ],
    [:varsep, 'What should I use to separate variables?', ' '],
    [
      :alive,
      "How can I know that a process is still alive?\n %1$s: PID, " \
      "output should be 1 for running and 0 for non-running.\n",
      "ps -p '%1$s'|tail -n+2|wc -l"
    ],
    [
      :kill,
      "How should I terminate tasks?\n %s: process ID.",
      "kill -9 '%s'"
    ]
  ].each do |key, question, default|
    v[key] = cli.ask_user(question, default)
  end
  v
end
52
+
53
##
# Asks the user for the settings of a 'slurm' daemon and stores them in the
# hash +v+, which is also the return value.
#
# The queue (partition) name is requested first and interpolated into the
# default launch command. Sets the keys :latency, :maxjobs and :ppn (coerced
# with +to_i+), then :cmd, :var, :varsep, :alive and :kill.
def configure_slurm_daemon(v)
  # NOTE(review): the 4th argument presumably makes the answer mandatory;
  # confirm against cli.ask_user
  queue = cli.ask_user('What queue should I use?', nil, nil, true)
  v[:latency] = cli.ask_user('How long should I sleep? (in secs)', '150').to_i
  v[:maxjobs] = cli.ask_user('How many jobs can I launch at once?', '300').to_i
  v[:ppn] = cli.ask_user('How many CPUs can I use per job?', '2').to_i
  cli.puts 'Setting up internal daemon defaults'
  cli.puts 'If you don\'t understand this just leave default values:'
  # The log file is a path (string), so it is documented as %4$s to match
  # the placeholder actually used in the default template below
  v[:cmd] = cli.ask_user(
    "How should I launch tasks?\n %1$s: script path, " \
    "%2$s: variables, %3$d: CPUs, %4$s: log file, %5$s: task name.\n",
    "%2$s sbatch --partition='#{queue}' --export=ALL " \
    "--nodes=1 --ntasks-per-node=%3$d --output='%4$s' " \
    "--job-name='%5$s' --mem=9G --time=12:00:00 %1$s " \
    "| perl -pe 's/.* //'"
  )
  v[:var] = cli.ask_user(
    "How should I pass variables?\n %1$s: keys, %2$s: values.\n",
    "%1$s=%2$s"
  )
  v[:varsep] = cli.ask_user(
    'What should I use to separate variables?', ' '
  )
  v[:alive] = cli.ask_user(
    "How can I know that a process is still alive?\n %1$s: job id, " \
    "output should be 1 for running and 0 for non-running.\n",
    "squeue -h -o %%t -j '%1$s' | grep '^PD\\|R\\|CF\\|CG$' " \
    "| tail -n 1 | wc -l"
  )
  v[:kill] = cli.ask_user(
    "How should I terminate tasks?\n %s: process ID.", "scancel '%s'"
  )
  v
end
81
+
82
##
# Asks the user for the settings of a 'qsub' or 'msub' daemon and stores them
# in the hash +v+, which is also the return value.
#
# +v+ must already contain :type ('qsub' or anything else, treated as msub):
# it names the submission command in the default launch template and selects
# the default commands used to check (:alive) and terminate (:kill) jobs.
# Sets the keys :latency, :maxjobs and :ppn (coerced with +to_i+), then :cmd,
# :var, :varsep, :alive and :kill.
def configure_qsub_msub_daemon(v)
  # NOTE(review): the 4th argument presumably makes the answer mandatory;
  # confirm against cli.ask_user
  queue = cli.ask_user('What queue should I use?', nil, nil, true)
  v[:latency] = cli.ask_user('How long should I sleep? (in secs)', '150').to_i
  v[:maxjobs] = cli.ask_user('How many jobs can I launch at once?', '300').to_i
  v[:ppn] = cli.ask_user('How many CPUs can I use per job?', '2').to_i
  cli.puts 'Setting up internal daemon defaults.'
  cli.puts 'If you don\'t understand this just leave default values:'
  # The log file is a path (string), so it is documented as %4$s to match
  # the placeholder actually used in the default template below
  v[:cmd] = cli.ask_user(
    "How should I launch tasks?\n %1$s: script path, " \
    "%2$s: variables, %3$d: CPUs, %4$s: log file, %5$s: task name.\n",
    "#{v[:type]} -q '#{queue}' -v '%2$s' -l nodes=1:ppn=%3$d %1$s " \
    "-j oe -o '%4$s' -N '%5$s' -l mem=9g -l walltime=12:00:00 " \
    "| grep ."
  )
  v[:var] = cli.ask_user(
    "How should I pass variables?\n %1$s: keys, %2$s: values.\n",
    "%1$s=%2$s"
  )
  v[:varsep] = cli.ask_user(
    'What should I use to separate variables?', ','
  )

  # Both schedulers share the questions; only the default commands differ
  alive_question =
    "How can I know that a process is still alive?\n " \
    "%1$s: job id, output should be 1 for running and " \
    "0 for non-running.\n"
  kill_question = "How should I terminate tasks?\n %s: process ID."
  if v[:type] == 'qsub'
    v[:alive] = cli.ask_user(
      alive_question,
      "qstat -f '%1$s'|grep ' job_state ='|perl -pe 's/.*= //'" \
      "|grep '[^C]'|tail -n1|wc -l|awk '{print $1}'"
    )
    v[:kill] = cli.ask_user(kill_question, "qdel '%s'")
  else # msub
    v[:alive] = cli.ask_user(
      alive_question,
      "checkjob '%1$s'|grep '^State:'|perl -pe 's/.*: //'" \
      "|grep 'Deferred\\|Hold\\|Idle\\|Starting\\|Running\\|Blocked'" \
      "|tail -n1|wc -l|awk '{print $1}'"
    )
    v[:kill] = cli.ask_user(kill_question, "canceljob '%s'")
  end
  v
end
123
+
124
+ end
@@ -6,50 +6,59 @@ require 'miga/cli/action'
6
6
  class MiGA::Cli::Action::Ls < MiGA::Cli::Action
7
7
 
8
8
  def parse_cli
9
- cli.defaults = {info: false, processing: false, silent: false}
9
+ cli.defaults = { info: false, processing: false, silent: false }
10
10
  cli.parse do |opt|
11
11
  cli.opt_object(opt, [:project, :dataset_opt])
12
12
  cli.opt_filter_datasets(opt)
13
13
  opt.on(
14
14
  '-i', '--info',
15
15
  'Print additional information on each dataset'
16
- ){ |v| cli[:info] = v }
16
+ ) { |v| cli[:info] = v }
17
17
  opt.on(
18
18
  '-p', '--processing',
19
19
  'Print information on processing advance'
20
- ){ |v| cli[:processing] = v }
20
+ ) { |v| cli[:processing] = v }
21
21
  opt.on(
22
22
  '-m', '--metadata STRING',
23
23
  'Print name and metadata field only',
24
24
  'If set, ignores -i and assumes --tab'
25
- ){ |v| cli[:datum] = v }
25
+ ) { |v| cli[:datum] = v }
26
26
  opt.on(
27
27
  '--tab',
28
28
  'Return a tab-delimited table'
29
- ){ |v| cli[:tabular] = v }
29
+ ) { |v| cli[:tabular] = v }
30
+ opt.on(
31
+ '-o', '--output PATH',
32
+ 'Create output file instead of returning to STDOUT'
33
+ ) { |v| cli[:output] = v }
30
34
  opt.on(
31
35
  '-s', '--silent',
32
36
  'No output and exit with non-zero status if the dataset list is empty'
33
- ){ |v| cli[:silent] = v }
37
+ ) { |v| cli[:silent] = v }
34
38
  end
35
39
  end
36
40
 
37
41
  def perform
38
42
  ds = cli.load_and_filter_datasets(cli[:silent])
39
43
  exit(ds.empty? ? 1 : 0) if cli[:silent]
44
+ io = cli[:output].nil? ? $stdout : File.open(cli[:output], 'w')
40
45
  if !cli[:datum].nil?
41
46
  ds.each do |d|
42
47
  v = d.metadata[cli[:datum]]
43
- puts "#{d.name}\t#{v.nil? ? '?' : v}"
48
+ cli.puts(io, "#{d.name}\t#{v.nil? ? '?' : v}")
44
49
  end
45
50
  elsif cli[:info]
46
- cli.table(Dataset.INFO_FIELDS, ds.map { |d| d.info })
51
+ cli.table(Dataset.INFO_FIELDS, ds.map { |d| d.info }, io)
47
52
  elsif cli[:processing]
48
53
  comp = %w[- done queued]
49
- cli.table([:name] + MiGA::Dataset.PREPROCESSING_TASKS,
50
- ds.map { |d| [d.name] + d.profile_advance.map { |i| comp[i] } })
54
+ cli.table(
55
+ [:name] + MiGA::Dataset.PREPROCESSING_TASKS,
56
+ ds.map { |d| [d.name] + d.profile_advance.map { |i| comp[i] } },
57
+ io
58
+ )
51
59
  else
52
- ds.each { |d| cli.puts d.name }
60
+ ds.each { |d| cli.puts(io, d.name) }
53
61
  end
62
+ io.close unless cli[:output].nil?
54
63
  end
55
64
  end
@@ -6,118 +6,202 @@ require 'miga/remote_dataset'
6
6
  require 'csv'
7
7
 
8
8
  class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action
9
-
10
9
  def parse_cli
11
- cli.defaults = {query: false, unlink: false,
10
+ cli.defaults = {
11
+ query: false, unlink: false,
12
12
  reference: false, legacy_name: false,
13
13
  complete: false, chromosome: false,
14
14
  scaffold: false, contig: false, add_version: true, dry: false,
15
- get_md: false, only_md: false, save_every: 1}
15
+ get_md: false, only_md: false, save_every: 1
16
+ }
16
17
  cli.parse do |opt|
17
18
  cli.opt_object(opt, [:project])
18
19
  opt.on(
19
20
  '-T', '--taxon STRING',
20
21
  '(Mandatory unless --reference) Taxon name (e.g., a species binomial)'
21
- ){ |v| cli[:taxon] = v }
22
- opt.on('--reference',
23
- 'Download all reference genomes (ignore any other status)'
24
- ){ |v| cli[:reference] = v }
25
- opt.on(
26
- '--complete',
27
- 'Download complete genomes'
28
- ){ |v| cli[:complete] = v }
29
- opt.on('--chromosome',
30
- 'Download complete chromosomes'
31
- ){ |v| cli[:chromosome] = v }
32
- opt.on(
33
- '--scaffold',
34
- 'Download genomes in scaffolds'
35
- ){ |v| cli[:scaffold] = v }
36
- opt.on(
37
- '--contig',
38
- 'Download genomes in contigs'
39
- ){ |v| cli[:contig] = v }
40
- opt.on(
41
- '--all',
42
- 'Download all genomes (in any status)') do
43
- cli[:complete] = true
44
- cli[:chromosome] = true
45
- cli[:scaffold] = true
46
- cli[:contig] = true
47
- end
48
- opt.on(
49
- '--no-version-name',
50
- 'Do not add sequence version to the dataset name',
51
- 'Only affects --complete and --chromosome'
52
- ){ |v| cli[:add_version] = v }
53
- opt.on(
54
- '--legacy-name',
55
- 'Use dataset names based on chromosome entries instead of assembly'
56
- ){ |v| cli[:legacy_name] = v }
57
- opt.on('--blacklist PATH',
58
- 'A file with dataset names to blacklist'
59
- ){ |v| cli[:blacklist] = v }
60
- opt.on(
61
- '--dry',
62
- 'Do not download or save the datasets'
63
- ){ |v| cli[:dry] = v }
64
- opt.on(
65
- '--ignore-until STRING',
66
- 'Ignores all datasets until a name is found (useful for large reruns)'
67
- ){ |v| cli[:ignore_until] = v }
68
- opt.on(
69
- '--get-metadata',
70
- 'Only download and update metadata for existing datasets'
71
- ){ |v| cli[:get_md] = v }
72
- opt.on('--only-metadata',
73
- 'Create datasets without input data but retrieve all metadata'
74
- ){ |v| cli[:only_md] = v }
75
- opt.on(
76
- '--save-every INT', Integer,
77
- 'Save project every this many downloaded datasets',
78
- 'If zero, it saves the project only once upon completion',
79
- "By default: #{cli[:save_every]}"
80
- ){ |v| cli[:save_every] = v }
22
+ ) { |v| cli[:taxon] = v }
81
23
  opt.on(
82
- '-q', '--query',
83
- 'Register the datasets as queries, not reference datasets'
84
- ){ |v| cli[:query] = v }
85
- opt.on(
86
- '-u', '--unlink',
87
- 'Unlink all datasets in the project missing from the download list'
88
- ){ |v| cli[:unlink] = v }
89
- opt.on('-R', '--remote-list PATH',
90
- 'Path to an output file with the list of all datasets listed remotely'
91
- ){ |v| cli[:remote_list] = v }
24
+ '-m', '--metadata STRING',
25
+ 'Metadata as key-value pairs separated by = and delimited by comma',
26
+ 'Values are saved as strings except for booleans (true / false) or nil'
27
+ ) { |v| cli[:metadata] = v }
28
+ cli_task_flags(opt)
29
+ cli_name_modifiers(opt)
30
+ cli_filters(opt)
31
+ cli_save_actions(opt)
92
32
  opt.on(
93
33
  '--api-key STRING',
94
34
  'NCBI API key'
95
- ){ |v| ENV['NCBI_API_KEY'] = v }
35
+ ) { |v| ENV['NCBI_API_KEY'] = v }
96
36
  end
97
37
  end
98
38
 
99
39
##
# Runs the action: validates the options, loads the project, retrieves the
# remote genome list, drops blacklisted names and processes the remaining
# entries. Afterwards it reports the totals, optionally writes the listed
# names (one per line) to cli[:remote_list], and, if cli[:unlink] is set,
# unlinks and removes every project dataset missing from the list.
def perform
  sanitize_cli
  project = cli.load_project
  entries = discard_blacklisted(remote_list)
  listed, downloaded = download_entries(entries, project)

  # Finalize
  cli.say "Datasets listed: #{listed.size}"
  cli.say "Datasets #{cli[:dry] ? 'to download' : 'downloaded'}: #{downloaded}"
  unless cli[:remote_list].nil?
    File.open(cli[:remote_list], 'w') do |fh|
      listed.each { |dataset_name| fh.puts dataset_name }
    end
  end
  return unless cli[:unlink]

  missing = project.dataset_names - listed
  missing.each { |dataset_name| project.unlink_dataset(dataset_name).remove! }
  cli.say "Datasets unlinked: #{missing.size}"
end
60
+
61
+ private
62
+
63
##
# Registers in +opt+ the options that select which genomes to download:
# one flag per status (via +cli.opt_flag+) plus --all, which switches on
# :complete, :chromosome, :scaffold and :contig together.
def cli_task_flags(opt)
  # NOTE(review): cli.opt_flag presumably stores the flag under the symbol
  # of the same name; confirm against the CLI option helper
  {
    'reference' => 'Download all reference genomes (ignore any other status)',
    'complete' => 'Download complete genomes',
    'chromosome' => 'Download complete chromosomes',
    'scaffold' => 'Download genomes in scaffolds',
    'contig' => 'Download genomes in contigs'
  }.each do |flag, description|
    cli.opt_flag(opt, flag, description)
  end
  opt.on('--all', 'Download all genomes (in any status)') do
    cli[:complete] = true
    cli[:chromosome] = true
    cli[:scaffold] = true
    cli[:contig] = true
  end
end
82
+
83
##
# Registers in +opt+ the options that alter how dataset names are built:
# --no-version-name (stored in cli[:add_version]) and the legacy-name flag
# (stored in cli[:legacy_name]).
def cli_name_modifiers(opt)
  version_help = [
    'Do not add sequence version to the dataset name',
    'Only affects --complete and --chromosome'
  ]
  opt.on('--no-version-name', *version_help) do |v|
    # The value handed over by the option parser is stored as is
    cli[:add_version] = v
  end
  legacy_help =
    'Use dataset names based on chromosome entries instead of assembly'
  cli.opt_flag(opt, 'legacy-name', legacy_help, :legacy_name)
end
95
+
96
##
# Registers in +opt+ the options that restrict what gets processed:
# --blacklist (cli[:blacklist]), the dry flag, --ignore-until
# (cli[:ignore_until]) and the get-metadata flag (cli[:get_md]).
def cli_filters(opt)
  opt.on('--blacklist PATH', 'A file with dataset names to blacklist') do |v|
    cli[:blacklist] = v
  end
  cli.opt_flag(opt, 'dry', 'Do not download or save the datasets')
  opt.on(
    '--ignore-until STRING',
    'Ignores all datasets until a name is found (useful for large reruns)'
  ) do |v|
    cli[:ignore_until] = v
  end
  cli.opt_flag(
    opt,
    'get-metadata',
    'Only download and update metadata for existing datasets',
    :get_md
  )
end
110
+
111
##
# Registers in +opt+ the options that control how results are saved:
# the only-metadata flag (cli[:only_md]), --save-every (cli[:save_every],
# parsed as Integer), --query (cli[:query]), --unlink (cli[:unlink]) and
# --remote-list (cli[:remote_list]).
def cli_save_actions(opt)
  cli.opt_flag(
    opt,
    'only-metadata',
    'Create datasets without input data but retrieve all metadata',
    :only_md
  )
  save_every_help = [
    'Save project every this many downloaded datasets',
    'If zero, it saves the project only once upon completion',
    "By default: #{cli[:save_every]}"
  ]
  opt.on('--save-every INT', Integer, *save_every_help) do |v|
    cli[:save_every] = v
  end
  opt.on(
    '-q', '--query',
    'Register the datasets as queries, not reference datasets'
  ) do |v|
    cli[:query] = v
  end
  opt.on(
    '-u', '--unlink',
    'Unlink all datasets in the project missing from the download list'
  ) do |v|
    cli[:unlink] = v
  end
  opt.on(
    '-R', '--remote-list PATH',
    'Path to an output file with the list of all datasets listed remotely'
  ) do |v|
    cli[:remote_list] = v
  end
end
135
+
136
+ def sanitize_cli
100
137
  cli.ensure_par(taxon: '-T') unless cli[:reference]
101
- unless %w[reference complete chromosome scaffold contig].any?{ |i| cli[i.to_sym] }
138
+ tasks = %w[reference complete chromosome scaffold contig]
139
+ unless tasks.any? { |i| cli[i.to_sym] }
102
140
  raise 'No action requested: pick at least one type of genome'
103
141
  end
104
142
  cli[:save_every] = 1 if cli[:dry]
143
+ end
105
144
 
106
- p = cli.load_project
107
- d = []
145
+ def remote_list
146
+ cli.say 'Downloading genome list'
108
147
  ds = {}
109
- downloaded = 0
148
+ url = remote_list_url
149
+ doc = RemoteDataset.download_url(url)
150
+ CSV.parse(doc, headers: true).each do |r|
151
+ asm = r['assembly']
152
+ next if asm.nil? || asm.empty? || asm == '-'
153
+ next unless r['ftp_path_genbank']
154
+ rep = remote_row_replicons(r)
155
+ n = remote_row_name(r, rep, asm)
156
+
157
+ # Register for download
158
+ fna_url = '%s/%s_genomic.fna.gz' %
159
+ [r['ftp_path_genbank'], File.basename(r['ftp_path_genbank'])]
160
+ ds[n] = {
161
+ ids: [fna_url], db: :assembly_gz, universe: :web,
162
+ md: {
163
+ type: :genome, ncbi_asm: asm, strain: r['strain']
164
+ }
165
+ }
166
+ ds[n][:md][:ncbi_nuccore] = rep.join(',') unless rep.nil?
167
+ unless r['release_date'].nil?
168
+ ds[n][:md][:release_date] = Time.parse(r['release_date']).to_s
169
+ end
170
+ end
171
+ ds
172
+ end
110
173
 
174
##
# Extracts the replicon identifiers from the 'replicons' field of row +r+.
#
# The field is split on '; ' and, for each entry, everything up to the last
# ':' and everything from the first '/' onwards is removed. Returns an Array
# of Strings, or nil when the row has no 'replicons' value.
def remote_row_replicons(r)
  replicons = r['replicons']
  return if replicons.nil?

  replicons.split('; ').map do |entry|
    entry.gsub(/.*:/, '').gsub(%r{/.*}, '')
  end
end
181
+
182
##
# Builds the MiGA dataset name for the remote row +r+.
#
# - With both cli[:legacy_name] and cli[:reference], the name is derived
#   from the organism alone.
# - Otherwise the organism is joined by '_' with an accession: the first
#   replicon in +rep+ (or '' if there is none) when cli[:legacy_name] is set
#   and the row level is in the legacy list, else the assembly +asm+.
# - Unless cli[:add_version] is set, a trailing '.<digits>' version is
#   stripped from the accession.
#
# Neither +asm+ nor the entries of +rep+ are modified: the caller keeps
# using them after this call, so the version is stripped on a copy.
def remote_row_name(r, rep, asm)
  return r['#organism'].miga_name if cli[:legacy_name] && cli[:reference]

  # NOTE(review): ' Chromosome' carries a leading space and so would not
  # match a level of exactly 'Chromosome'; confirm whether that is intended
  acc =
    if cli[:legacy_name] && ['Complete', ' Chromosome'].include?(r['level'])
      rep.nil? || rep.empty? ? '' : rep.first
    else
      asm
    end
  acc = acc.gsub(/\.\d+\Z/, '') unless cli[:add_version]
  "#{r['#organism']}_#{acc}".miga_name
end
192
+
193
+ def remote_list_url
111
194
  url_base = 'https://www.ncbi.nlm.nih.gov/genomes/solr2txt.cgi?'
112
195
  url_param = {
113
- q: '[display()].' +
114
- 'from(GenomeAssemblies).' +
115
- 'usingschema(/schema/GenomeAssemblies).' +
116
- 'matching(tab==["Prokaryotes"] and q=="' + cli[:taxon].tr('"',"'") + '"',
117
- fields: 'organism|organism,assembly|assembly,replicons|replicons,' +
118
- 'level|level,ftp_path_genbank|ftp_path_genbank,release_date|release_date,' +
119
- 'strain|strain',
120
- nolimit: 'on',
196
+ q: '[display()].' \
197
+ 'from(GenomeAssemblies).' \
198
+ 'usingschema(/schema/GenomeAssemblies).' \
199
+ 'matching(tab==["Prokaryotes"] and q=="' \
200
+ "#{cli[:taxon].tr('"', "'")}\"",
201
+ fields: 'organism|organism,assembly|assembly,replicons|replicons,' \
202
+ 'level|level,ftp_path_genbank|ftp_path_genbank,' \
203
+ 'release_date|release_date,strain|strain',
204
+ nolimit: 'on'
121
205
  }
122
206
  if cli[:reference]
123
207
  url_param[:q] += ' and refseq_category==["representative"]'
@@ -131,95 +215,53 @@ class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action
131
215
  url_param[:q] += ' and level==[' + status + ']'
132
216
  end
133
217
  url_param[:q] += ')'
134
- url = url_base + URI.encode_www_form(url_param)
135
- cli.say 'Downloading genome list'
136
- lineno = 0
137
- doc = RemoteDataset.download_url(url)
138
- CSV.parse(doc, headers: true).each do |r|
139
- asm = r['assembly']
140
- next if asm.nil? or asm.empty? or asm == '-'
141
- next unless r['ftp_path_genbank']
142
-
143
- # Get replicons
144
- rep = r['replicons'].nil? ? nil : r['replicons'].
145
- split('; ').map{ |i| i.gsub(/.*:/,'') }.map{ |i| i.gsub(/\/.*/, '') }
146
-
147
- # Set name
148
- if cli[:legacy_name] and cli[:reference]
149
- n = r['#organism'].miga_name
150
- else
151
- if cli[:legacy_name] and ['Complete',' Chromosome'].include? r['level']
152
- acc = rep.nil? ? '' : rep.first
153
- else
154
- acc = asm
155
- end
156
- acc.gsub!(/\.\d+\Z/, '') unless cli[:add_version]
157
- n = "#{r['#organism']}_#{acc}".miga_name
158
- end
159
-
160
- # Register for download
161
- fna_url = r['ftp_path_genbank'] + '/' +
162
- File.basename(r['ftp_path_genbank']) + '_genomic.fna.gz'
163
- ds[n] = {
164
- ids: [fna_url], db: :assembly_gz, universe: :web,
165
- md: {
166
- type: :genome, ncbi_asm: asm, strain: r['strain']
167
- }
168
- }
169
- ds[n][:md][:ncbi_nuccore] = rep.join(',') unless rep.nil?
170
- ds[n][:md][:release_date] =
171
- Time.parse(r['release_date']).to_s unless r['release_date'].nil?
172
- end
218
+ url_base + URI.encode_www_form(url_param)
219
+ end
173
220
 
174
- # Discard blacklisted
221
+ def discard_blacklisted(ds)
175
222
  unless cli[:blacklist].nil?
176
223
  cli.say "Discarding datasets in #{cli[:blacklist]}"
177
- File.readlines(cli[:blacklist]).
178
- select{ |i| i !~ /^#/ }.map(&:chomp).each{ |i| ds.delete i }
224
+ File.readlines(cli[:blacklist])
225
+ .select { |i| i !~ /^#/ }
226
+ .map(&:chomp)
227
+ .each { |i| ds.delete i }
179
228
  end
229
+ ds
230
+ end
180
231
 
181
- # Download entries
232
+ def download_entries(ds, p)
182
233
  cli.say "Downloading #{ds.size} " + (ds.size == 1 ? 'entry' : 'entries')
183
234
  p.do_not_save = true if cli[:save_every] != 1
184
235
  ignore = !cli[:ignore_until].nil?
236
+ downloaded = 0
237
+ d = []
185
238
  ds.each do |name, body|
186
239
  d << name
187
240
  cli.puts name
188
241
  ignore = false if ignore && name == cli[:ignore_until]
189
- next if ignore
190
- next if p.dataset(name).nil? == cli[:get_md]
242
+ next if ignore || p.dataset(name).nil? == cli[:get_md]
191
243
  downloaded += 1
192
- next if cli[:dry]
193
- cli.say ' Locating remote dataset'
194
- body[:md][:metadata_only] = true if cli[:only_md]
195
- rd = RemoteDataset.new(body[:ids], body[:db], body[:universe])
196
- if cli[:get_md]
197
- cli.say ' Updating dataset'
198
- rd.update_metadata(p.dataset(name), body[:md])
199
- else
200
- cli.say ' Creating dataset'
201
- rd.save_to(p, name, !cli[:query], body[:md])
202
- p.add_dataset(name)
244
+ unless cli[:dry]
245
+ save_entry(name, body, p)
246
+ p.save! if cli[:save_every] > 1 && (downloaded % cli[:save_every]).zero?
203
247
  end
204
- p.save! if cli[:save_every] > 1 and (downloaded % cli[:save_every]) == 0
205
248
  end
206
-
207
249
  p.do_not_save = false
208
250
  p.save! if cli[:save_every] != 1
251
+ [d, downloaded]
252
+ end
209
253
 
210
- # Finalize
211
- cli.say "Datasets listed: #{d.size}"
212
- cli.say "Datasets #{cli[:dry] ? 'to download' : 'downloaded'}: " +
213
- downloaded.to_s
214
- unless cli[:remote_list].nil?
215
- File.open(cli[:remote_list], 'w') do |fh|
216
- d.each { |i| fh.puts i }
217
- end
218
- end
219
- if cli[:unlink]
220
- unlink = p.dataset_names - d
221
- unlink.each { |i| p.unlink_dataset(i).remove! }
222
- cli.say "Datasets unlinked: #{unlink.size}"
254
##
# Processes one remote entry +body+ (keys :ids, :db, :universe and :md)
# under the dataset name +name+ in project +p+.
#
# Marks the metadata as :metadata_only when cli[:only_md] is set. With
# cli[:get_md] the metadata of the existing dataset is updated; otherwise
# the dataset is saved to the project (as reference unless cli[:query]),
# added to it, and the result of +cli.add_metadata+ on it is saved.
def save_entry(name, body, p)
  cli.say ' Locating remote dataset'
  metadata = body[:md]
  metadata[:metadata_only] = true if cli[:only_md]
  remote = RemoteDataset.new(body[:ids], body[:db], body[:universe])

  if cli[:get_md]
    cli.say ' Updating dataset'
    return remote.update_metadata(p.dataset(name), metadata)
  end

  cli.say ' Creating dataset'
  remote.save_to(p, name, !cli[:query], metadata)
  dataset = p.add_dataset(name)
  cli.add_metadata(dataset).save
end
225
267
  end