miga-base 0.4.3.0 → 0.5.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (120) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +1 -1
  3. data/lib/miga/cli.rb +43 -223
  4. data/lib/miga/cli/action/add.rb +91 -62
  5. data/lib/miga/cli/action/classify_wf.rb +97 -0
  6. data/lib/miga/cli/action/daemon.rb +14 -10
  7. data/lib/miga/cli/action/derep_wf.rb +95 -0
  8. data/lib/miga/cli/action/doctor.rb +83 -55
  9. data/lib/miga/cli/action/get.rb +68 -52
  10. data/lib/miga/cli/action/get_db.rb +206 -0
  11. data/lib/miga/cli/action/index_wf.rb +31 -0
  12. data/lib/miga/cli/action/init.rb +115 -190
  13. data/lib/miga/cli/action/init/daemon_helper.rb +124 -0
  14. data/lib/miga/cli/action/ls.rb +20 -11
  15. data/lib/miga/cli/action/ncbi_get.rb +199 -157
  16. data/lib/miga/cli/action/preproc_wf.rb +46 -0
  17. data/lib/miga/cli/action/quality_wf.rb +45 -0
  18. data/lib/miga/cli/action/stats.rb +147 -99
  19. data/lib/miga/cli/action/summary.rb +10 -4
  20. data/lib/miga/cli/action/tax_dist.rb +61 -46
  21. data/lib/miga/cli/action/tax_test.rb +46 -39
  22. data/lib/miga/cli/action/wf.rb +178 -0
  23. data/lib/miga/cli/base.rb +11 -0
  24. data/lib/miga/cli/objects_helper.rb +88 -0
  25. data/lib/miga/cli/opt_helper.rb +160 -0
  26. data/lib/miga/daemon.rb +7 -4
  27. data/lib/miga/dataset/base.rb +5 -5
  28. data/lib/miga/project/base.rb +4 -4
  29. data/lib/miga/project/result.rb +2 -1
  30. data/lib/miga/remote_dataset/base.rb +5 -5
  31. data/lib/miga/remote_dataset/download.rb +1 -1
  32. data/lib/miga/version.rb +3 -3
  33. data/scripts/cds.bash +3 -1
  34. data/scripts/essential_genes.bash +1 -0
  35. data/scripts/stats.bash +1 -1
  36. data/scripts/trimmed_fasta.bash +5 -3
  37. data/utils/distance/runner.rb +3 -0
  38. data/utils/distance/temporal.rb +10 -1
  39. data/utils/enveomics/Manifest/Tasks/fasta.json +5 -0
  40. data/utils/enveomics/Manifest/Tasks/sequence-identity.json +7 -0
  41. data/utils/enveomics/Scripts/BlastTab.addlen.rb +33 -31
  42. data/utils/enveomics/Scripts/FastA.tag.rb +42 -41
  43. data/utils/enveomics/Scripts/HMM.essential.rb +85 -55
  44. data/utils/enveomics/Scripts/HMM.haai.rb +29 -20
  45. data/utils/enveomics/Scripts/SRA.download.bash +1 -1
  46. data/utils/enveomics/Scripts/aai.rb +163 -128
  47. data/utils/enveomics/build_enveomics_r.bash +11 -10
  48. data/utils/enveomics/enveomics.R/DESCRIPTION +3 -2
  49. data/utils/enveomics/enveomics.R/R/autoprune.R +141 -107
  50. data/utils/enveomics/enveomics.R/R/barplot.R +105 -86
  51. data/utils/enveomics/enveomics.R/R/cliopts.R +131 -115
  52. data/utils/enveomics/enveomics.R/R/df2dist.R +144 -106
  53. data/utils/enveomics/enveomics.R/R/growthcurve.R +201 -133
  54. data/utils/enveomics/enveomics.R/R/recplot.R +350 -315
  55. data/utils/enveomics/enveomics.R/R/recplot2.R +1334 -914
  56. data/utils/enveomics/enveomics.R/R/tribs.R +521 -361
  57. data/utils/enveomics/enveomics.R/R/utils.R +31 -15
  58. data/utils/enveomics/enveomics.R/README.md +7 -0
  59. data/utils/enveomics/enveomics.R/man/cash-enve.GrowthCurve-method.Rd +17 -0
  60. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2-method.Rd +17 -0
  61. data/utils/enveomics/enveomics.R/man/cash-enve.RecPlot2.Peak-method.Rd +17 -0
  62. data/utils/enveomics/enveomics.R/man/enve.GrowthCurve-class.Rd +16 -21
  63. data/utils/enveomics/enveomics.R/man/enve.TRIBS-class.Rd +31 -28
  64. data/utils/enveomics/enveomics.R/man/enve.TRIBS.merge.Rd +23 -19
  65. data/utils/enveomics/enveomics.R/man/enve.TRIBStest-class.Rd +36 -26
  66. data/utils/enveomics/enveomics.R/man/enve.__prune.iter.Rd +23 -24
  67. data/utils/enveomics/enveomics.R/man/enve.__prune.reduce.Rd +23 -24
  68. data/utils/enveomics/enveomics.R/man/enve.__tribs.Rd +32 -33
  69. data/utils/enveomics/enveomics.R/man/enve.barplot.Rd +91 -64
  70. data/utils/enveomics/enveomics.R/man/enve.cliopts.Rd +57 -37
  71. data/utils/enveomics/enveomics.R/man/enve.col.alpha.Rd +24 -19
  72. data/utils/enveomics/enveomics.R/man/enve.col2alpha.Rd +19 -18
  73. data/utils/enveomics/enveomics.R/man/enve.df2dist.Rd +39 -26
  74. data/utils/enveomics/enveomics.R/man/enve.df2dist.group.Rd +38 -25
  75. data/utils/enveomics/enveomics.R/man/enve.df2dist.list.Rd +40 -26
  76. data/utils/enveomics/enveomics.R/man/enve.growthcurve.Rd +67 -49
  77. data/utils/enveomics/enveomics.R/man/enve.prune.dist.Rd +37 -28
  78. data/utils/enveomics/enveomics.R/man/enve.recplot.Rd +122 -97
  79. data/utils/enveomics/enveomics.R/man/enve.recplot2-class.Rd +35 -31
  80. data/utils/enveomics/enveomics.R/man/enve.recplot2.ANIr.Rd +24 -23
  81. data/utils/enveomics/enveomics.R/man/enve.recplot2.Rd +68 -51
  82. data/utils/enveomics/enveomics.R/man/enve.recplot2.__counts.Rd +25 -24
  83. data/utils/enveomics/enveomics.R/man/enve.recplot2.__peakHist.Rd +21 -22
  84. data/utils/enveomics/enveomics.R/man/enve.recplot2.__whichClosestPeak.Rd +19 -20
  85. data/utils/enveomics/enveomics.R/man/enve.recplot2.changeCutoff.Rd +19 -18
  86. data/utils/enveomics/enveomics.R/man/enve.recplot2.compareIdentities.Rd +41 -32
  87. data/utils/enveomics/enveomics.R/man/enve.recplot2.coordinates.Rd +29 -24
  88. data/utils/enveomics/enveomics.R/man/enve.recplot2.corePeak.Rd +18 -18
  89. data/utils/enveomics/enveomics.R/man/enve.recplot2.extractWindows.Rd +40 -34
  90. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.Rd +36 -24
  91. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_e.Rd +19 -20
  92. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__em_m.Rd +19 -20
  93. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__emauto_one.Rd +27 -29
  94. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mow_one.Rd +41 -42
  95. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.__mower.Rd +17 -18
  96. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.em.Rd +43 -33
  97. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.emauto.Rd +36 -28
  98. data/utils/enveomics/enveomics.R/man/enve.recplot2.findPeaks.mower.Rd +74 -56
  99. data/utils/enveomics/enveomics.R/man/enve.recplot2.peak-class.Rd +44 -31
  100. data/utils/enveomics/enveomics.R/man/enve.recplot2.seqdepth.Rd +27 -22
  101. data/utils/enveomics/enveomics.R/man/enve.recplot2.windowDepthThreshold.Rd +32 -26
  102. data/utils/enveomics/enveomics.R/man/enve.tribs.Rd +59 -44
  103. data/utils/enveomics/enveomics.R/man/enve.tribs.test.Rd +28 -21
  104. data/utils/enveomics/enveomics.R/man/enve.truncate.Rd +27 -22
  105. data/utils/enveomics/enveomics.R/man/plot.enve.GrowthCurve.Rd +63 -43
  106. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBS.Rd +38 -29
  107. data/utils/enveomics/enveomics.R/man/plot.enve.TRIBStest.Rd +38 -30
  108. data/utils/enveomics/enveomics.R/man/plot.enve.recplot2.Rd +111 -83
  109. data/utils/enveomics/enveomics.R/man/summary.enve.GrowthCurve.Rd +19 -18
  110. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBS.Rd +19 -18
  111. data/utils/enveomics/enveomics.R/man/summary.enve.TRIBStest.Rd +19 -18
  112. data/utils/find-medoid.R +3 -2
  113. data/utils/representatives.rb +5 -3
  114. data/utils/subclade/pipeline.rb +22 -11
  115. data/utils/subclade/runner.rb +5 -1
  116. data/utils/subclades-compile.rb +1 -1
  117. data/utils/subclades.R +9 -3
  118. metadata +15 -4
  119. data/utils/enveomics/enveomics.R/man/enveomics.R-package.Rd +0 -15
  120. data/utils/enveomics/enveomics.R/man/z$-methods.Rd +0 -26
@@ -0,0 +1,124 @@
1
# @package MiGA
# @license Artistic-2.0

##
# Helper module with daemon configuration functions for MiGA::Cli::Action::Init
module MiGA::Cli::Action::Init::DaemonHelper
  ##
  # Interactively build the default daemon template (~/.miga_daemon.json),
  # unless one already exists and the user chooses to preserve it.
  # Delegates scheduler-specific questions to one of the configure_* helpers.
  def configure_daemon
    cli.puts 'Default daemon configuration:'
    daemon_f = File.expand_path('.miga_daemon.json', ENV['HOME'])
    unless File.exist?(daemon_f) && cli.ask_user(
             'A template daemon already exists, do you want to preserve it?',
             'yes', %w(yes no)) == 'yes'
      v = { created: Time.now.to_s, updated: Time.now.to_s }
      v[:type] = cli.ask_user(
        'Please select the type of daemon you want to setup',
        cli[:dtype], %w(bash qsub msub slurm))
      case v[:type]
      when 'bash'
        v = configure_bash_daemon(v)
      when 'slurm'
        v = configure_slurm_daemon(v)
      else # [qm]sub
        v = configure_qsub_msub_daemon(v)
      end
      File.open(daemon_f, 'w') { |fh| fh.puts JSON.pretty_generate(v) }
    end
    cli.puts ''
  end

  ##
  # Ask for the configuration of a local (bash) daemon and return the
  # completed Hash +v+.
  def configure_bash_daemon(v)
    v[:latency] = cli.ask_user('How long should I sleep? (in secs)', '2').to_i
    v[:maxjobs] = cli.ask_user('How many jobs can I launch at once?', '6').to_i
    v[:ppn] = cli.ask_user('How many CPUs can I use per job?', '2').to_i
    cli.puts 'Setting up internal daemon defaults.'
    cli.puts 'If you don\'t understand this just leave default values:'
    v[:cmd] = cli.ask_user(
      "How should I launch tasks?\n %1$s: script path, " \
      "%2$s: variables, %3$d: CPUs, %4$s: log file, %5$s: task name.\n",
      "%2$s '%1$s' > '%4$s' 2>&1")
    v[:var] = cli.ask_user(
      "How should I pass variables?\n %1$s: keys, %2$s: values.\n",
      "%1$s=%2$s")
    v[:varsep] = cli.ask_user('What should I use to separate variables?', ' ')
    v[:alive] = cli.ask_user(
      "How can I know that a process is still alive?\n %1$s: PID, " \
      "output should be 1 for running and 0 for non-running.\n",
      "ps -p '%1$s'|tail -n+2|wc -l")
    v[:kill] = cli.ask_user(
      "How should I terminate tasks?\n %s: process ID.", "kill -9 '%s'")
    v
  end

  ##
  # Ask for the configuration of a SLURM daemon and return the
  # completed Hash +v+.
  def configure_slurm_daemon(v)
    queue = cli.ask_user('What queue should I use?', nil, nil, true)
    v[:latency] = cli.ask_user('How long should I sleep? (in secs)', '150').to_i
    v[:maxjobs] = cli.ask_user('How many jobs can I launch at once?', '300').to_i
    v[:ppn] = cli.ask_user('How many CPUs can I use per job?', '2').to_i
    cli.puts 'Setting up internal daemon defaults'
    cli.puts 'If you don\'t understand this just leave default values:'
    v[:cmd] = cli.ask_user(
      "How should I launch tasks?\n %1$s: script path, " \
      "%2$s: variables, %3$d: CPUs, %4$d: log file, %5$s: task name.\n",
      "%2$s sbatch --partition='#{queue}' --export=ALL " \
      "--nodes=1 --ntasks-per-node=%3$d --output='%4$s' " \
      "--job-name='%5$s' --mem=9G --time=12:00:00 %1$s " \
      "| perl -pe 's/.* //'")
    v[:var] = cli.ask_user(
      "How should I pass variables?\n %1$s: keys, %2$s: values.\n",
      "%1$s=%2$s")
    v[:varsep] = cli.ask_user(
      'What should I use to separate variables?', ' ')
    v[:alive] = cli.ask_user(
      "How can I know that a process is still alive?\n %1$s: job id, " \
      "output should be 1 for running and 0 for non-running.\n",
      "squeue -h -o %%t -j '%1$s' | grep '^PD\\|R\\|CF\\|CG$' " \
      "| tail -n 1 | wc -l")
    v[:kill] = cli.ask_user(
      "How should I terminate tasks?\n %s: process ID.", "scancel '%s'")
    v
  end

  ##
  # Ask for the configuration of a PBS (qsub) or Moab (msub) daemon and
  # return the completed Hash +v+.
  # FIX: the method previously took no parameter but read and returned +v+
  # (and is invoked as configure_qsub_msub_daemon(v) by configure_daemon),
  # which raised NameError at runtime; it now accepts +v+ like its siblings.
  def configure_qsub_msub_daemon(v)
    queue = cli.ask_user('What queue should I use?', nil, nil, true)
    v[:latency] = cli.ask_user('How long should I sleep? (in secs)', '150').to_i
    v[:maxjobs] = cli.ask_user('How many jobs can I launch at once?', '300').to_i
    v[:ppn] = cli.ask_user('How many CPUs can I use per job?', '2').to_i
    cli.puts 'Setting up internal daemon defaults.'
    cli.puts 'If you don\'t understand this just leave default values:'
    v[:cmd] = cli.ask_user(
      "How should I launch tasks?\n %1$s: script path, " \
      "%2$s: variables, %3$d: CPUs, %4$d: log file, %5$s: task name.\n",
      "#{v[:type]} -q '#{queue}' -v '%2$s' -l nodes=1:ppn=%3$d %1$s " \
      "-j oe -o '%4$s' -N '%5$s' -l mem=9g -l walltime=12:00:00 " \
      "| grep .")
    v[:var] = cli.ask_user(
      "How should I pass variables?\n %1$s: keys, %2$s: values.\n",
      "%1$s=%2$s")
    v[:varsep] = cli.ask_user(
      'What should I use to separate variables?', ',')
    if v[:type] == 'qsub'
      v[:alive] = cli.ask_user(
        "How can I know that a process is still alive?\n " \
        "%1$s: job id, output should be 1 for running and " \
        "0 for non-running.\n",
        "qstat -f '%1$s'|grep ' job_state ='|perl -pe 's/.*= //'" \
        "|grep '[^C]'|tail -n1|wc -l|awk '{print $1}'")
      v[:kill] = cli.ask_user(
        "How should I terminate tasks?\n %s: process ID.", "qdel '%s'")
    else # msub
      v[:alive] = cli.ask_user(
        "How can I know that a process is still alive?\n " \
        "%1$s: job id, output should be 1 for running and " \
        "0 for non-running.\n",
        "checkjob '%1$s'|grep '^State:'|perl -pe 's/.*: //'" \
        "|grep 'Deferred\\|Hold\\|Idle\\|Starting\\|Running\\|Blocked'" \
        "|tail -n1|wc -l|awk '{print $1}'")
      v[:kill] = cli.ask_user(
        "How should I terminate tasks?\n %s: process ID.",
        "canceljob '%s'")
    end
    v
  end
end
@@ -6,50 +6,59 @@ require 'miga/cli/action'
6
6
class MiGA::Cli::Action::Ls < MiGA::Cli::Action
  ##
  # Define and parse command-line options for `miga ls`.
  def parse_cli
    cli.defaults = { info: false, processing: false, silent: false }
    cli.parse do |opt|
      cli.opt_object(opt, [:project, :dataset_opt])
      cli.opt_filter_datasets(opt)
      opt.on(
        '-i', '--info',
        'Print additional information on each dataset'
      ) { |v| cli[:info] = v }
      opt.on(
        '-p', '--processing',
        'Print information on processing advance'
      ) { |v| cli[:processing] = v }
      opt.on(
        '-m', '--metadata STRING',
        'Print name and metadata field only',
        'If set, ignores -i and assumes --tab'
      ) { |v| cli[:datum] = v }
      opt.on(
        '--tab',
        'Return a tab-delimited table'
      ) { |v| cli[:tabular] = v }
      opt.on(
        '-o', '--output PATH',
        'Create output file instead of returning to STDOUT'
      ) { |v| cli[:output] = v }
      opt.on(
        '-s', '--silent',
        'No output and exit with non-zero status if the dataset list is empty'
      ) { |v| cli[:silent] = v }
    end
  end

  ##
  # List the project's datasets in the requested format, writing either to
  # STDOUT or to the file given with -o/--output.
  def perform
    ds = cli.load_and_filter_datasets(cli[:silent])
    exit(ds.empty? ? 1 : 0) if cli[:silent]
    io = cli[:output].nil? ? $stdout : File.open(cli[:output], 'w')
    begin
      if !cli[:datum].nil?
        # Name + single metadata field ('?' when missing)
        ds.each do |d|
          v = d.metadata[cli[:datum]]
          cli.puts(io, "#{d.name}\t#{v.nil? ? '?' : v}")
        end
      elsif cli[:info]
        cli.table(Dataset.INFO_FIELDS, ds.map { |d| d.info }, io)
      elsif cli[:processing]
        comp = %w[- done queued]
        cli.table(
          [:name] + MiGA::Dataset.PREPROCESSING_TASKS,
          ds.map { |d| [d.name] + d.profile_advance.map { |i| comp[i] } },
          io
        )
      else
        ds.each { |d| cli.puts(io, d.name) }
      end
    ensure
      # FIX: close the output file even when reporting raises mid-way,
      # so the handle (and buffered output) is not leaked on error
      io.close unless cli[:output].nil?
    end
  end
end
@@ -6,118 +6,202 @@ require 'miga/remote_dataset'
6
6
  require 'csv'
7
7
 
8
8
  class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action
9
-
10
9
  ##
  # Define and parse the command-line options for `miga ncbi_get`.
  # Option groups are registered by the private helpers cli_task_flags,
  # cli_name_modifiers, cli_filters and cli_save_actions.
  def parse_cli
    cli.defaults = {
      query: false, unlink: false,
      reference: false, legacy_name: false,
      complete: false, chromosome: false,
      scaffold: false, contig: false, add_version: true, dry: false,
      get_md: false, only_md: false, save_every: 1
    }
    cli.parse do |opt|
      cli.opt_object(opt, [:project])
      opt.on(
        '-T', '--taxon STRING',
        '(Mandatory unless --reference) Taxon name (e.g., a species binomial)'
      ) { |v| cli[:taxon] = v }
      opt.on(
        '-m', '--metadata STRING',
        'Metadata as key-value pairs separated by = and delimited by comma',
        'Values are saved as strings except for booleans (true / false) or nil'
      ) { |v| cli[:metadata] = v }
      cli_task_flags(opt)
      cli_name_modifiers(opt)
      cli_filters(opt)
      cli_save_actions(opt)
      opt.on(
        '--api-key STRING',
        'NCBI API key'
      ) { |v| ENV['NCBI_API_KEY'] = v }
    end
  end
98
38
 
99
39
  def perform
40
+ sanitize_cli
41
+ p = cli.load_project
42
+ ds = remote_list
43
+ ds = discard_blacklisted(ds)
44
+ d, downloaded = download_entries(ds, p)
45
+
46
+ # Finalize
47
+ cli.say "Datasets listed: #{d.size}"
48
+ act = cli[:dry] ? 'to download' : 'downloaded'
49
+ cli.say "Datasets #{act}: #{downloaded}"
50
+ unless cli[:remote_list].nil?
51
+ File.open(cli[:remote_list], 'w') do |fh|
52
+ d.each { |i| fh.puts i }
53
+ end
54
+ end
55
+ return unless cli[:unlink]
56
+ unlink = p.dataset_names - d
57
+ unlink.each { |i| p.unlink_dataset(i).remove! }
58
+ cli.say "Datasets unlinked: #{unlink.size}"
59
+ end
60
+
61
+ private
62
+
63
  ##
  # Register the genome-category flags on +opt+: --reference, --complete,
  # --chromosome, --scaffold, --contig, and --all (which turns on the
  # four status categories at once).
  def cli_task_flags(opt)
    cli.opt_flag(
      opt, 'reference',
      'Download all reference genomes (ignore any other status)'
    )
    cli.opt_flag(opt, 'complete', 'Download complete genomes')
    cli.opt_flag(opt, 'chromosome', 'Download complete chromosomes')
    cli.opt_flag(opt, 'scaffold', 'Download genomes in scaffolds')
    cli.opt_flag(opt, 'contig', 'Download genomes in contigs')
    opt.on(
      '--all',
      'Download all genomes (in any status)'
    ) do
      cli[:complete] = true
      cli[:chromosome] = true
      cli[:scaffold] = true
      cli[:contig] = true
    end
  end
82
+
83
  ##
  # Register the dataset-naming flags on +opt+:
  # --no-version-name (clears :add_version) and --legacy-name.
  def cli_name_modifiers(opt)
    opt.on(
      '--no-version-name',
      'Do not add sequence version to the dataset name',
      'Only affects --complete and --chromosome'
    ) { |v| cli[:add_version] = v }
    cli.opt_flag(
      opt, 'legacy-name',
      'Use dataset names based on chromosome entries instead of assembly',
      :legacy_name
    )
  end
95
+
96
  ##
  # Register the listing-filter flags on +opt+: --blacklist, --dry,
  # --ignore-until, and --get-metadata (stored as :get_md).
  def cli_filters(opt)
    opt.on(
      '--blacklist PATH',
      'A file with dataset names to blacklist'
    ) { |v| cli[:blacklist] = v }
    cli.opt_flag(opt, 'dry', 'Do not download or save the datasets')
    opt.on(
      '--ignore-until STRING',
      'Ignores all datasets until a name is found (useful for large reruns)'
    ) { |v| cli[:ignore_until] = v }
    cli.opt_flag(
      opt, 'get-metadata',
      'Only download and update metadata for existing datasets', :get_md)
  end
110
+
111
  ##
  # Register the save-behavior flags on +opt+: --only-metadata (:only_md),
  # --save-every, -q/--query, -u/--unlink, and -R/--remote-list.
  def cli_save_actions(opt)
    cli.opt_flag(
      opt, 'only-metadata',
      'Create datasets without input data but retrieve all metadata',
      :only_md)
    opt.on(
      '--save-every INT', Integer,
      'Save project every this many downloaded datasets',
      'If zero, it saves the project only once upon completion',
      "By default: #{cli[:save_every]}"
    ) { |v| cli[:save_every] = v }
    opt.on(
      '-q', '--query',
      'Register the datasets as queries, not reference datasets'
    ) { |v| cli[:query] = v }
    opt.on(
      '-u', '--unlink',
      'Unlink all datasets in the project missing from the download list'
    ) { |v| cli[:unlink] = v }
    opt.on(
      '-R', '--remote-list PATH',
      'Path to an output file with the list of all datasets listed remotely'
    ) { |v| cli[:remote_list] = v }
  end
135
+
136
+ def sanitize_cli
100
137
  cli.ensure_par(taxon: '-T') unless cli[:reference]
101
- unless %w[reference complete chromosome scaffold contig].any?{ |i| cli[i.to_sym] }
138
+ tasks = %w[reference complete chromosome scaffold contig]
139
+ unless tasks.any? { |i| cli[i.to_sym] }
102
140
  raise 'No action requested: pick at least one type of genome'
103
141
  end
104
142
  cli[:save_every] = 1 if cli[:dry]
143
+ end
105
144
 
106
- p = cli.load_project
107
- d = []
145
+ def remote_list
146
+ cli.say 'Downloading genome list'
108
147
  ds = {}
109
- downloaded = 0
148
+ url = remote_list_url
149
+ doc = RemoteDataset.download_url(url)
150
+ CSV.parse(doc, headers: true).each do |r|
151
+ asm = r['assembly']
152
+ next if asm.nil? || asm.empty? || asm == '-'
153
+ next unless r['ftp_path_genbank']
154
+ rep = remote_row_replicons(r)
155
+ n = remote_row_name(r, rep, asm)
156
+
157
+ # Register for download
158
+ fna_url = '%s/%s_genomic.fna.gz' %
159
+ [r['ftp_path_genbank'], File.basename(r['ftp_path_genbank'])]
160
+ ds[n] = {
161
+ ids: [fna_url], db: :assembly_gz, universe: :web,
162
+ md: {
163
+ type: :genome, ncbi_asm: asm, strain: r['strain']
164
+ }
165
+ }
166
+ ds[n][:md][:ncbi_nuccore] = rep.join(',') unless rep.nil?
167
+ unless r['release_date'].nil?
168
+ ds[n][:md][:release_date] = Time.parse(r['release_date']).to_s
169
+ end
170
+ end
171
+ ds
172
+ end
110
173
 
174
+ def remote_row_replicons(r)
175
+ return if r['replicons'].nil?
176
+ r['replicons']
177
+ .split('; ')
178
+ .map { |i| i.gsub(/.*:/, '') }
179
+ .map { |i| i.gsub(%r{/.*}, '') }
180
+ end
181
+
182
+ def remote_row_name(r, rep, asm)
183
+ return r['#organism'].miga_name if cli[:legacy_name] && cli[:reference]
184
+ if cli[:legacy_name] && ['Complete', ' Chromosome'].include?(r['level'])
185
+ acc = rep.nil? ? '' : rep.first
186
+ else
187
+ acc = asm
188
+ end
189
+ acc.gsub!(/\.\d+\Z/, '') unless cli[:add_version]
190
+ "#{r['#organism']}_#{acc}".miga_name
191
+ end
192
+
193
+ def remote_list_url
111
194
  url_base = 'https://www.ncbi.nlm.nih.gov/genomes/solr2txt.cgi?'
112
195
  url_param = {
113
- q: '[display()].' +
114
- 'from(GenomeAssemblies).' +
115
- 'usingschema(/schema/GenomeAssemblies).' +
116
- 'matching(tab==["Prokaryotes"] and q=="' + cli[:taxon].tr('"',"'") + '"',
117
- fields: 'organism|organism,assembly|assembly,replicons|replicons,' +
118
- 'level|level,ftp_path_genbank|ftp_path_genbank,release_date|release_date,' +
119
- 'strain|strain',
120
- nolimit: 'on',
196
+ q: '[display()].' \
197
+ 'from(GenomeAssemblies).' \
198
+ 'usingschema(/schema/GenomeAssemblies).' \
199
+ 'matching(tab==["Prokaryotes"] and q=="' \
200
+ "#{cli[:taxon].tr('"', "'")}\"",
201
+ fields: 'organism|organism,assembly|assembly,replicons|replicons,' \
202
+ 'level|level,ftp_path_genbank|ftp_path_genbank,' \
203
+ 'release_date|release_date,strain|strain',
204
+ nolimit: 'on'
121
205
  }
122
206
  if cli[:reference]
123
207
  url_param[:q] += ' and refseq_category==["representative"]'
@@ -131,95 +215,53 @@ class MiGA::Cli::Action::NcbiGet < MiGA::Cli::Action
131
215
  url_param[:q] += ' and level==[' + status + ']'
132
216
  end
133
217
  url_param[:q] += ')'
134
- url = url_base + URI.encode_www_form(url_param)
135
- cli.say 'Downloading genome list'
136
- lineno = 0
137
- doc = RemoteDataset.download_url(url)
138
- CSV.parse(doc, headers: true).each do |r|
139
- asm = r['assembly']
140
- next if asm.nil? or asm.empty? or asm == '-'
141
- next unless r['ftp_path_genbank']
142
-
143
- # Get replicons
144
- rep = r['replicons'].nil? ? nil : r['replicons'].
145
- split('; ').map{ |i| i.gsub(/.*:/,'') }.map{ |i| i.gsub(/\/.*/, '') }
146
-
147
- # Set name
148
- if cli[:legacy_name] and cli[:reference]
149
- n = r['#organism'].miga_name
150
- else
151
- if cli[:legacy_name] and ['Complete',' Chromosome'].include? r['level']
152
- acc = rep.nil? ? '' : rep.first
153
- else
154
- acc = asm
155
- end
156
- acc.gsub!(/\.\d+\Z/, '') unless cli[:add_version]
157
- n = "#{r['#organism']}_#{acc}".miga_name
158
- end
159
-
160
- # Register for download
161
- fna_url = r['ftp_path_genbank'] + '/' +
162
- File.basename(r['ftp_path_genbank']) + '_genomic.fna.gz'
163
- ds[n] = {
164
- ids: [fna_url], db: :assembly_gz, universe: :web,
165
- md: {
166
- type: :genome, ncbi_asm: asm, strain: r['strain']
167
- }
168
- }
169
- ds[n][:md][:ncbi_nuccore] = rep.join(',') unless rep.nil?
170
- ds[n][:md][:release_date] =
171
- Time.parse(r['release_date']).to_s unless r['release_date'].nil?
172
- end
218
+ url_base + URI.encode_www_form(url_param)
219
+ end
173
220
 
174
- # Discard blacklisted
221
+ def discard_blacklisted(ds)
175
222
  unless cli[:blacklist].nil?
176
223
  cli.say "Discarding datasets in #{cli[:blacklist]}"
177
- File.readlines(cli[:blacklist]).
178
- select{ |i| i !~ /^#/ }.map(&:chomp).each{ |i| ds.delete i }
224
+ File.readlines(cli[:blacklist])
225
+ .select { |i| i !~ /^#/ }
226
+ .map(&:chomp)
227
+ .each { |i| ds.delete i }
179
228
  end
229
+ ds
230
+ end
180
231
 
181
- # Download entries
232
  ##
  # Download (or dry-run) every entry of +ds+ into project +p+.
  # Honors --ignore-until, --get-metadata, --dry and --save-every.
  # Returns [listed_names, downloaded_count].
  def download_entries(ds, p)
    cli.say "Downloading #{ds.size} " + (ds.size == 1 ? 'entry' : 'entries')
    # Defer project saving when batching (--save-every != 1)
    p.do_not_save = true if cli[:save_every] != 1
    ignore = !cli[:ignore_until].nil?
    downloaded = 0
    d = []
    ds.each do |name, body|
      d << name
      cli.puts name
      # Stop ignoring once the --ignore-until name is reached
      ignore = false if ignore && name == cli[:ignore_until]
      # Skip while ignoring, or when the dataset's existence in the project
      # does not match the mode: --get-metadata needs it to exist, the
      # default mode needs it to be absent
      next if ignore || p.dataset(name).nil? == cli[:get_md]
      downloaded += 1
      unless cli[:dry]
        save_entry(name, body, p)
        # Periodic project save every --save-every successful downloads
        p.save! if cli[:save_every] > 1 && (downloaded % cli[:save_every]).zero?
      end
    end
    p.do_not_save = false
    # Single final save when not saving per-dataset
    p.save! if cli[:save_every] != 1
    [d, downloaded]
  end
209
253
 
210
- # Finalize
211
- cli.say "Datasets listed: #{d.size}"
212
- cli.say "Datasets #{cli[:dry] ? 'to download' : 'downloaded'}: " +
213
- downloaded.to_s
214
- unless cli[:remote_list].nil?
215
- File.open(cli[:remote_list], 'w') do |fh|
216
- d.each { |i| fh.puts i }
217
- end
218
- end
219
- if cli[:unlink]
220
- unlink = p.dataset_names - d
221
- unlink.each { |i| p.unlink_dataset(i).remove! }
222
- cli.say "Datasets unlinked: #{unlink.size}"
254
+ def save_entry(name, body, p)
255
+ cli.say ' Locating remote dataset'
256
+ body[:md][:metadata_only] = true if cli[:only_md]
257
+ rd = RemoteDataset.new(body[:ids], body[:db], body[:universe])
258
+ if cli[:get_md]
259
+ cli.say ' Updating dataset'
260
+ rd.update_metadata(p.dataset(name), body[:md])
261
+ else
262
+ cli.say ' Creating dataset'
263
+ rd.save_to(p, name, !cli[:query], body[:md])
264
+ cli.add_metadata(p.add_dataset(name)).save
223
265
  end
224
266
  end
225
267
  end