scout-gear 10.8.4 → 10.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. checksums.yaml +4 -4
  2. data/.vimproject +38 -0
  3. data/README.md +352 -0
  4. data/VERSION +1 -1
  5. data/bin/scout +4 -1
  6. data/doc/Association.md +288 -0
  7. data/doc/Entity.md +296 -0
  8. data/doc/KnowledgeBase.md +433 -0
  9. data/doc/Persist.md +356 -0
  10. data/doc/Semaphore.md +171 -0
  11. data/doc/TSV.md +449 -0
  12. data/doc/WorkQueue.md +359 -0
  13. data/doc/Workflow.md +586 -0
  14. data/lib/scout/association.rb +4 -2
  15. data/lib/scout/entity/identifiers.rb +1 -1
  16. data/lib/scout/entity/object.rb +1 -1
  17. data/lib/scout/entity/property.rb +5 -5
  18. data/lib/scout/entity.rb +1 -1
  19. data/lib/scout/knowledge_base/description.rb +1 -1
  20. data/lib/scout/knowledge_base/list.rb +7 -2
  21. data/lib/scout/knowledge_base/registry.rb +4 -5
  22. data/lib/scout/knowledge_base.rb +20 -2
  23. data/lib/scout/monitor.rb +10 -6
  24. data/lib/scout/persist/engine/packed_index.rb +2 -2
  25. data/lib/scout/persist/engine/sharder.rb +1 -1
  26. data/lib/scout/persist/tsv.rb +1 -0
  27. data/lib/scout/semaphore.rb +1 -1
  28. data/lib/scout/tsv/dumper.rb +3 -3
  29. data/lib/scout/tsv/open.rb +1 -0
  30. data/lib/scout/tsv/parser.rb +1 -1
  31. data/lib/scout/tsv/transformer.rb +1 -0
  32. data/lib/scout/tsv/util.rb +2 -2
  33. data/lib/scout/work_queue/socket.rb +1 -1
  34. data/lib/scout/work_queue/worker.rb +7 -5
  35. data/lib/scout/workflow/definition.rb +11 -0
  36. data/lib/scout/workflow/deployment/local.rb +288 -0
  37. data/lib/scout/workflow/deployment/orchestrator/batches.rb +130 -0
  38. data/lib/scout/workflow/deployment/orchestrator/chains.rb +104 -0
  39. data/lib/scout/workflow/deployment/orchestrator/rules.rb +256 -0
  40. data/lib/scout/workflow/deployment/orchestrator/workload.rb +67 -0
  41. data/lib/scout/workflow/deployment/scheduler/job.rb +740 -0
  42. data/lib/scout/workflow/deployment/scheduler/lfs.rb +125 -0
  43. data/lib/scout/workflow/deployment/scheduler/pbs.rb +176 -0
  44. data/lib/scout/workflow/deployment/scheduler/slurm.rb +158 -0
  45. data/lib/scout/workflow/deployment/scheduler.rb +73 -0
  46. data/lib/scout/workflow/deployment.rb +10 -1
  47. data/lib/scout/workflow/entity.rb +22 -1
  48. data/lib/scout/workflow/exceptions.rb +2 -0
  49. data/lib/scout/workflow/step/config.rb +6 -3
  50. data/lib/scout/workflow/step/file.rb +4 -0
  51. data/lib/scout/workflow/step/info.rb +10 -4
  52. data/lib/scout/workflow/step/progress.rb +52 -0
  53. data/lib/scout/workflow/step.rb +39 -5
  54. data/lib/scout/workflow/task/inputs.rb +1 -1
  55. data/lib/scout/workflow/task.rb +2 -0
  56. data/lib/scout/workflow/usage.rb +3 -2
  57. data/lib/scout/workflow/util.rb +22 -0
  58. data/scout-gear.gemspec +37 -7
  59. data/scout_commands/batch/list +1 -1
  60. data/scout_commands/cat +86 -0
  61. data/scout_commands/doc +3 -1
  62. data/scout_commands/entity +151 -0
  63. data/scout_commands/system/status +238 -0
  64. data/scout_commands/workflow/cmd +5 -13
  65. data/scout_commands/workflow/info +23 -10
  66. data/scout_commands/workflow/install +1 -1
  67. data/scout_commands/workflow/task +61 -25
  68. data/test/scout/entity/test_property.rb +1 -1
  69. data/test/scout/knowledge_base/test_registry.rb +19 -0
  70. data/test/scout/test_work_queue.rb +1 -1
  71. data/test/scout/work_queue/test_worker.rb +12 -10
  72. data/test/scout/workflow/deployment/orchestrator/test_batches.rb +138 -0
  73. data/test/scout/workflow/deployment/orchestrator/test_chains.rb +171 -0
  74. data/test/scout/workflow/deployment/orchestrator/test_rules.rb +219 -0
  75. data/test/scout/workflow/deployment/orchestrator/test_workload.rb +117 -0
  76. data/test/scout/workflow/deployment/scheduler/test_job.rb +31 -0
  77. data/test/scout/workflow/deployment/scheduler/test_lfs.rb +32 -0
  78. data/test/scout/workflow/deployment/scheduler/test_pbs.rb +32 -0
  79. data/test/scout/workflow/deployment/scheduler/test_slurm.rb +32 -0
  80. data/test/scout/workflow/deployment/{test_orchestrator.rb → test_local.rb} +161 -33
  81. data/test/scout/workflow/deployment/test_scheduler.rb +75 -0
  82. data/test/scout/workflow/deployment/test_trace.rb +1 -1
  83. data/test/scout/workflow/step/test_progress.rb +27 -0
  84. data/test/scout/workflow/task/test_inputs.rb +17 -0
  85. data/test/test_helper.rb +2 -1
  86. metadata +36 -6
  87. data/doc/lib/scout/path.md +0 -35
  88. data/doc/lib/scout/workflow/task.md +0 -13
  89. data/lib/scout/workflow/deployment/orchestrator.rb +0 -292
@@ -0,0 +1,125 @@
1
+ require_relative 'job'
2
+ require 'scout'
3
+
4
# LSF backend for Scout batch jobs, submitted through `bsub`.
#
# Extends SchedulerJob (loaded from the sibling `job` file) and supplies the
# LSF-specific pieces: the environment preamble, the `#BSUB` header, the
# submission command and the status query.
module LSF
  extend SchedulerJob

  # Identifier of this batch system; also exported as BATCH_SYSTEM.
  def self.system
    "LSF"
  end

  # Bash preamble that computes MAX_MEMORY / MAX_MEMORY_DEFAULT and exports
  # the variables describing the running job.
  #
  # BATCH_JOB_ID prefers LSB_JOBID (the job id variable LSF sets for a running
  # job) and falls back to the previously used LSF_JOBID.
  def self.batch_system_variables
    <<~EOF
      let TOTAL_PROCESORS="$(cat /proc/cpuinfo|grep ^processor |wc -l)"
      let MAX_MEMORY_DEFAULT="$(grep MemTotal /proc/meminfo|grep -o "[[:digit:]]*") / ( (1024 * $TOTAL_PROCESORS) / $LSB_MAX_NUM_PROCESSORS )"
      [ ! -z $LSB_MAX_MEM_RUSAGE ] && let MAX_MEMORY="$LSB_MAX_MEM_RUSAGE" || MAX_MEMORY="$MAX_MEMORY_DEFAULT"
      export MAX_MEMORY_DEFAULT
      export MAX_MEMORY
      export BATCH_JOB_ID=${LSB_JOBID:-$LSF_JOBID}
      export BATCH_SYSTEM=#{system}
    EOF
  end

  # Builds the `#BSUB` header of the batch script.
  #
  # Options read: :queue, :task_cpus, :time, :workdir, :exclusive,
  # :batch_dir and :batch_name (defaults to the basename of :batch_dir).
  # Standard output and error are directed to std.out / std.err inside
  # :batch_dir.
  #
  # Returns the header as a String.
  def self.header(options = {})
    options = options.dup

    queue      = IndiferentHash.process_options options, :queue
    task_cpus  = IndiferentHash.process_options options, :task_cpus
    time       = IndiferentHash.process_options options, :time
    workdir    = IndiferentHash.process_options options, :workdir
    exclusive  = IndiferentHash.process_options options, :exclusive

    batch_dir  = IndiferentHash.process_options options, :batch_dir
    batch_name = IndiferentHash.process_options options, :batch_name
    batch_name ||= File.basename(batch_dir)

    fout = File.join(batch_dir, 'std.out')
    ferr = File.join(batch_dir, 'std.err')

    if time
      # Values without a colon are treated as a time span and formatted first
      time = Misc.format_seconds Misc.timespan(time) unless time.to_s.include? ":"
      # Only the first two colon-separated fields are passed to -W
      time = time.split(":").values_at(0, 1) * ":"
    end

    header = <<~EOF
      #!/bin/bash
      #BSUB -J "#{batch_name}"
      #BSUB -cwd "#{workdir}"
      #BSUB -oo "#{fout}"
      #BSUB -eo "#{ferr}"
      #BSUB -q "#{queue}"
      #BSUB -n "#{task_cpus}"
    EOF

    # Without a time there is no wall-clock limit to request
    header << "#BSUB -W \"#{time}\"" << "\n" if time
    header << "#BSUB -x" << "\n" if exclusive

    header
  end

  # Submits the batch script found in +batch_dir+ unless it was already
  # submitted or has finished.
  #
  # Returns nil when exit.status or std.out already exist, the stored id when
  # job.id exists, and the id parsed from the `bsub` output otherwise.
  # Raises DryRun (with +batch_dir+ as message) when +dry_run+ is set, after
  # printing the command that would have been run.
  def self.run_template(batch_dir, dry_run)
    fout   = File.join(batch_dir, 'std.out')
    ferr   = File.join(batch_dir, 'std.err')
    fjob   = File.join(batch_dir, 'job.id')
    fdep   = File.join(batch_dir, 'dependencies.list')
    fcfdep = File.join(batch_dir, 'canfail_dependencies.list')
    fexit  = File.join(batch_dir, 'exit.status')
    fsync  = File.join(batch_dir, 'sync.log')
    fcmd   = File.join(batch_dir, 'command.batch')

    return if Open.exists?(fexit)

    STDERR.puts Log.color(:magenta, "Issuing LSF file: #{fcmd}")
    STDERR.puts Open.read(fcmd)

    return Open.read(fjob).to_i if File.exist?(fjob)

    dependencies         = File.exist?(fdep) ? Open.read(fdep).split("\n") : []
    canfail_dependencies = File.exist?(fcfdep) ? Open.read(fcfdep).split("\n") : []

    # Regular dependencies must complete successfully; can-fail dependencies
    # only need to have ended, whatever their outcome (the counterpart of the
    # `afterany` condition used by the SLURM and PBS backends).
    dep_list = dependencies.collect { |d| "post_done(#{d})" } +
               canfail_dependencies.collect { |d| "ended(#{d})" }

    dep_str = dep_list.any? ? '-w "' + dep_list * " && " + '"' : ""

    cmd = "bsub #{dep_str} < '#{fcmd}'"

    return if File.exist?(fout)

    if dry_run
      STDERR.puts Log.color(:magenta, "To execute run: ") + Log.color(:blue, cmd)
      STDERR.puts Log.color(:magenta, "To monitor progress run (needs local scout): ") + Log.color(:blue, "scout lsf tail '#{batch_dir}'")
      raise DryRun, batch_dir
    end

    # Clear leftovers from previous attempts before submitting
    [fsync, fexit, fout, ferr].each { |file| Open.rm file }

    job = CMD.cmd(cmd).read.scan(/\d+/).first.to_i
    Log.debug "BSUB job id: #{job}"
    Open.write(fjob, job.to_s)
    job
  end

  # Returns the raw output of `bjobs -w`, for all jobs or just for +job+.
  # A failing query for a specific job yields an empty string, matching the
  # SLURM and PBS backends.
  def self.job_status(job = nil)
    return CMD.cmd("bjobs -w").read if job.nil?

    begin
      CMD.cmd("bjobs -w #{job}").read
    rescue
      ""
    end
  end
end
125
+
@@ -0,0 +1,176 @@
1
+ require_relative 'job'
2
+ require 'scout'
3
+
4
# PBS backend for Scout batch jobs, submitted through `qsub`.
#
# Extends SchedulerJob (loaded from the sibling `job` file) and supplies the
# PBS-specific pieces: the environment preamble, the `#PBS` header, the
# submission command and the status query.
module PBS
  extend SchedulerJob

  # Identifier of this batch system; also exported as BATCH_SYSTEM.
  def self.system
    "PBS"
  end

  # Bash preamble that computes MAX_MEMORY / MAX_MEMORY_DEFAULT, exports the
  # variables describing the running job and changes into PBS_O_WORKDIR.
  def self.batch_system_variables
    <<~EOF
      let TOTAL_PROCESORS="$(cat /proc/cpuinfo|grep ^processor |wc -l)"
      let MAX_MEMORY_DEFAULT="$(grep MemTotal /proc/meminfo|grep -o "[[:digit:]]*") / ( (1024 * $TOTAL_PROCESORS) / $PBS_CPUS_PER_TASK )"
      MAX_MEMORY="$MAX_MEMORY_DEFAULT"
      [ ! -z $PBS_MEM_PER_CPU ] && let MAX_MEMORY="$PBS_MEM_PER_CPU * $PBS_CPUS_PER_TASK"
      [ ! -z $PBS_MEM_PER_NODE ] && MAX_MEMORY="$PBS_MEM_PER_NODE"
      export MAX_MEMORY_DEFAULT
      export MAX_MEMORY
      export BATCH_JOB_ID=$PBS_JOBID
      export BATCH_SYSTEM=#{system}

      cd ${PBS_O_WORKDIR}
    EOF
  end

  # Builds the `#PBS` header of the batch script.
  #
  # Options read: :batch_dir, :queue, :account, :time, :nodes, :place
  # (defaults to 'scatter'), :partition (emitted as `-l system=`) and
  # :filesystems (defaults to "home"; arrays are joined with commas).
  # Other options accepted by the SLURM backend (:task_cpus, :exclusive,
  # :highmem, :licenses, :constraint, :gres, :mem, :mem_per_cpu, :workdir,
  # :batch_name) are currently not translated into PBS directives.
  #
  # Returns the header as a String.
  def self.header(options = {})
    options = options.dup

    batch_dir   = IndiferentHash.process_options options, :batch_dir

    queue       = IndiferentHash.process_options options, :queue
    account     = IndiferentHash.process_options options, :account
    time        = IndiferentHash.process_options options, :time
    nodes       = IndiferentHash.process_options options, :nodes

    place       = IndiferentHash.process_options options, :place, :place => 'scatter'
    # Named pbs_system so it does not shadow the module's `system` method
    pbs_system  = IndiferentHash.process_options options, :partition
    filesystems = IndiferentHash.process_options options, :filesystems

    filesystems = "home" if filesystems.nil?
    filesystems = filesystems * "," if Array === filesystems

    fout = File.join(batch_dir, 'std.out')
    ferr = File.join(batch_dir, 'std.err')

    # Values without a colon are treated as a time span and formatted; a
    # missing time simply produces no walltime directive
    time = Misc.format_seconds Misc.timespan(time) if time && !time.to_s.include?(":")

    # Each key is the literal directive prefix; a value of true emits the key
    # alone, nil or empty values are skipped
    qsub_params = {
      "-l filesystems=" => filesystems,
      "-l system="      => pbs_system,
      "-l select="      => nodes,
      "-l place="       => place,
      "-l walltime="    => time,
      "-q "             => queue,
      "-A "             => account,
      "-o "             => fout,
      "-e "             => ferr,
      "-k doe"          => true,
    }

    header = <<~EOF
      #!/bin/bash
    EOF

    qsub_params.each do |name, value|
      next if value.nil? || value == ""

      case value
      when TrueClass
        header << "#PBS #{name}" << "\n"
      when Array
        value.each { |v| header << "#PBS #{name}\"#{v}\"" << "\n" }
      else
        header << "#PBS #{name}\"#{value}\"" << "\n"
      end
    end

    header
  end

  # Submits the batch script found in +batch_dir+ unless it was already
  # submitted or has finished.
  #
  # Returns nil when exit.status or std.out already exist, the stored id when
  # job.id exists, and the id parsed from the `qsub` output otherwise.
  # Raises DryRun (with +batch_dir+ as message) when +dry_run+ is set, after
  # printing how to submit the script manually.
  def self.run_template(batch_dir, dry_run)
    fout   = File.join(batch_dir, 'std.out')
    ferr   = File.join(batch_dir, 'std.err')
    fjob   = File.join(batch_dir, 'job.id')
    fdep   = File.join(batch_dir, 'dependencies.list')
    fcfdep = File.join(batch_dir, 'canfail_dependencies.list')
    fexit  = File.join(batch_dir, 'exit.status')
    fsync  = File.join(batch_dir, 'sync.log')
    fcmd   = File.join(batch_dir, 'command.batch')

    return if Open.exists?(fexit)

    Log.info "Issuing PBS file: #{fcmd}"
    Log.debug Open.read(fcmd)

    return Open.read(fjob).to_i if File.exist?(fjob)

    dependencies         = File.exist?(fdep) ? Open.read(fdep).split("\n") : []
    canfail_dependencies = File.exist?(fcfdep) ? Open.read(fcfdep).split("\n") : []

    # Regular dependencies must succeed (afterok); can-fail ones only need to
    # have terminated (afterany)
    dep_conditions = []
    dep_conditions << "afterok:" + dependencies * ":" if dependencies.any?
    dep_conditions << "afterany:" + canfail_dependencies * ":" if canfail_dependencies.any?

    dep_str = dep_conditions.empty? ? "" : '-W depend=' + dep_conditions * ","

    cmd = "qsub #{dep_str} '#{fcmd}'"

    return if File.exist?(fout)

    if dry_run
      STDERR.puts Log.color(:magenta, "To execute run: ") + Log.color(:blue, "qsub '#{fcmd}'")
      STDERR.puts Log.color(:magenta, "To monitor progress run (needs local scout): ") + Log.color(:blue, "scout pbs tail '#{batch_dir}'")
      raise DryRun, batch_dir
    end

    # Clear leftovers from previous attempts before submitting
    [fsync, fexit, fout, ferr].each { |file| Open.rm file }

    job = CMD.cmd(cmd).read.scan(/\d+/).first.to_i
    Log.debug "QSUB job id: #{job}"
    Open.write(fjob, job.to_s)
    job
  end

  # Returns the raw output of `qstat`, for all jobs or just for +job+.
  # A failing query for a specific job yields an empty string.
  def self.job_status(job = nil)
    return CMD.cmd("qstat").read if job.nil?

    begin
      CMD.cmd("qstat #{job}").read
    rescue
      ""
    end
  end
end
176
+
@@ -0,0 +1,158 @@
1
+ require_relative 'job'
2
+ require 'scout'
3
+
4
# SLURM backend for Scout batch jobs, submitted through `sbatch`.
#
# Extends SchedulerJob (loaded from the sibling `job` file) and supplies the
# SLURM-specific pieces: the environment preamble, the `#SBATCH` header, the
# submission command and the status query.
module SLURM
  extend SchedulerJob

  # Identifier of this batch system; also exported as BATCH_SYSTEM.
  def self.system
    "SLURM"
  end

  # Bash preamble that computes MAX_MEMORY / MAX_MEMORY_DEFAULT and exports
  # the variables describing the running job.
  def self.batch_system_variables
    <<~EOF
      let TOTAL_PROCESORS="$(cat /proc/cpuinfo|grep ^processor |wc -l)"
      let MAX_MEMORY_DEFAULT="$(grep MemTotal /proc/meminfo|grep -o "[[:digit:]]*") / ( (1024 * $TOTAL_PROCESORS) / $SLURM_CPUS_PER_TASK )"
      MAX_MEMORY="$MAX_MEMORY_DEFAULT"
      [ ! -z $SLURM_MEM_PER_CPU ] && let MAX_MEMORY="$SLURM_MEM_PER_CPU * $SLURM_CPUS_PER_TASK"
      [ ! -z $SLURM_MEM_PER_NODE ] && MAX_MEMORY="$SLURM_MEM_PER_NODE"
      export MAX_MEMORY_DEFAULT
      export MAX_MEMORY
      export BATCH_JOB_ID=$SLURM_JOB_ID
      export BATCH_SYSTEM=#{system}
    EOF
  end

  # Builds the `#SBATCH` header of the batch script.
  #
  # Options read: :queue (emitted as --qos), :account, :partition,
  # :task_cpus, :time, :nodes, :exclusive, :highmem, :licenses,
  # :constraints, :gres, :mem, :mem_per_cpu, :batch_dir and :batch_name.
  # :highmem appends "highmem" to the constraints, joined with "&".
  #
  # Returns the header as a String.
  def self.header(options = {})
    options = options.dup

    queue       = IndiferentHash.process_options options, :queue
    account     = IndiferentHash.process_options options, :account
    partition   = IndiferentHash.process_options options, :partition
    task_cpus   = IndiferentHash.process_options options, :task_cpus
    time        = IndiferentHash.process_options options, :time
    nodes       = IndiferentHash.process_options options, :nodes
    exclusive   = IndiferentHash.process_options options, :exclusive
    highmem     = IndiferentHash.process_options options, :highmem
    licenses    = IndiferentHash.process_options options, :licenses
    constraints = IndiferentHash.process_options options, :constraints
    gres        = IndiferentHash.process_options options, :gres

    constraints = [constraints, "highmem"].compact * "&" if highmem

    mem         = IndiferentHash.process_options options, :mem
    mem_per_cpu = IndiferentHash.process_options options, :mem_per_cpu

    batch_dir   = IndiferentHash.process_options options, :batch_dir
    batch_name  = IndiferentHash.process_options options, :batch_name

    fout = File.join(batch_dir, 'std.out')
    ferr = File.join(batch_dir, 'std.err')

    # Values without a colon are treated as a time span and formatted; a
    # missing time simply produces no --time directive
    time = Misc.format_seconds Misc.timespan(time) if time && !time.to_s.include?(":")

    # Keys are sbatch long-option names. Note the option is spelled
    # `--constraint` (singular) even though the Scout option is :constraints.
    sbatch_params = {
      "job-name"      => batch_name,
      "qos"           => queue,
      "account"       => account,
      "partition"     => partition,
      "output"        => fout,
      "error"         => ferr,
      "cpus-per-task" => task_cpus,
      "nodes"         => nodes,
      "time"          => time,
      "constraint"    => constraints,
      "exclusive"     => exclusive,
      "licenses"      => licenses,
      "gres"          => gres,
      "mem"           => mem,
      "mem-per-cpu"   => mem_per_cpu,
    }

    header = <<~EOF
      #!/bin/bash
    EOF

    # true emits a bare flag, arrays repeat the option, nil/empty is skipped
    sbatch_params.each do |name, value|
      next if value.nil? || value == ""

      case value
      when TrueClass
        header << "#SBATCH --#{name}" << "\n"
      when Array
        value.each { |v| header << "#SBATCH --#{name}=\"#{v}\"" << "\n" }
      else
        header << "#SBATCH --#{name}=\"#{value}\"" << "\n"
      end
    end

    header
  end

  # Submits the batch script found in +batch_dir+ unless it was already
  # submitted or has finished.
  #
  # Returns nil when exit.status or std.out already exist, the stored id when
  # job.id exists, and the id parsed from the `sbatch` output otherwise.
  # Raises DryRun (with +batch_dir+ as message) when +dry_run+ is set, after
  # printing how to submit the script manually.
  def self.run_template(batch_dir, dry_run)
    fout   = File.join(batch_dir, 'std.out')
    ferr   = File.join(batch_dir, 'std.err')
    fjob   = File.join(batch_dir, 'job.id')
    fdep   = File.join(batch_dir, 'dependencies.list')
    fcfdep = File.join(batch_dir, 'canfail_dependencies.list')
    fexit  = File.join(batch_dir, 'exit.status')
    fsync  = File.join(batch_dir, 'sync.log')
    fcmd   = File.join(batch_dir, 'command.batch')

    return if Open.exists?(fexit)

    Log.info "Issuing SLURM file: #{fcmd}"

    return Open.read(fjob).to_i if File.exist?(fjob)

    dependencies         = File.exist?(fdep) ? Open.read(fdep).split("\n") : []
    canfail_dependencies = File.exist?(fcfdep) ? Open.read(fcfdep).split("\n") : []

    # Regular dependencies must succeed (afterok); can-fail ones only need to
    # have terminated (afterany)
    dep_conditions = []
    dep_conditions << "afterok:" + dependencies * ":" if dependencies.any?
    dep_conditions << "afterany:" + canfail_dependencies * ":" if canfail_dependencies.any?

    dep_str = dep_conditions.empty? ? "" : '--dependency=' + dep_conditions * ","

    cmd = "sbatch #{dep_str} '#{fcmd}'"

    return if File.exist?(fout)

    if dry_run
      STDERR.puts Log.color(:magenta, "To execute run: ") + Log.color(:blue, "sbatch '#{fcmd}'")
      STDERR.puts Log.color(:magenta, "To monitor progress run (needs local scout): ") + Log.color(:blue, "scout slurm tail '#{batch_dir}'")
      raise DryRun, batch_dir
    end

    # Clear leftovers from previous attempts before submitting
    [fsync, fexit, fout, ferr].each { |file| Open.rm file }

    job = CMD.cmd(cmd).read.scan(/\d+/).first.to_i
    Log.debug "SBATCH job id: #{job}"
    Open.write(fjob, job.to_s)
    job
  end

  # Returns the raw output of `squeue`, for all jobs or just for +job+.
  # A failing query for a specific job yields an empty string.
  def self.job_status(job = nil)
    return CMD.cmd("squeue").read if job.nil?

    begin
      CMD.cmd("squeue --job #{job}").read
    rescue
      ""
    end
  end
end
@@ -0,0 +1,73 @@
1
+ require_relative 'orchestrator/chains'
2
+ require_relative 'orchestrator/rules'
3
+ require_relative 'orchestrator/batches'
4
+
5
+ require_relative 'scheduler/slurm'
6
+ require_relative 'scheduler/pbs'
7
+ require_relative 'scheduler/lfs'
8
+
9
# Submits workflow jobs to a batch system (SLURM, LSF or PBS) instead of
# running them locally.
module Workflow::Scheduler
  # Groups +jobs+ into batches according to +rules+ and submits them.
  #
  # Returns whatever process_batches returns for the resulting batches.
  def self.produce(jobs, rules = {}, options = {})
    batches = Workflow::Orchestrator.job_batches(rules, jobs)
    Workflow::Scheduler.process_batches(batches, options)
  end

  # Submits +batches+ in dependency order.
  #
  # Each batch is a Hash from which :deps (batches it depends on, may be
  # nil), :rules (used as job options), :jobs and :top_level are read.
  # +process_options+ supplies defaults for the options of every batch.
  #
  # The batch system name comes from Scout::Config (key :system, tokens
  # :batch, :scheduler and 'env:BATCH_SYSTEM'), defaulting to 'SLURM'.
  #
  # Returns an Array with one entry per batch, in submission order: the id
  # returned by the scheduler's run_job, or the DryRun message when the
  # submission was a dry run.
  # Raises RuntimeError for an unknown batch system, or when the batch
  # dependencies cannot be ordered.
  def self.process_batches(batches, process_options = {})
    sorted = dependency_order(batches)

    batch_system = Scout::Config.get :system, :batch, :scheduler, 'env:BATCH_SYSTEM', default: 'SLURM'

    batch_ids = {}
    sorted.collect do |batch|
      job_options = batch[:rules]
      job_options = IndiferentHash.add_defaults job_options, process_options.dup

      # Ids of already submitted dependencies; can-fail ones are tagged so the
      # scheduler can treat them separately
      batch_dependencies = (batch[:deps] || []).collect do |dep|
        dep_target = dep[:top_level]
        dep_id = batch_ids[dep_target].to_s

        dep_target.canfail? ? 'canfail:' + dep_id : dep_id
      end

      job_options.merge!(:batch_dependencies => batch_dependencies)
      job_options.merge!(:manifest => batch[:jobs].collect { |d| d.task_signature })

      begin
        id, _dir = scheduler_for(batch_system).run_job(batch[:top_level], job_options)
        batch_ids[batch[:top_level]] = id
      rescue DryRun => e
        e.message
      end
    end
  end

  # Orders +batches+ so that every batch comes after the batches listed in
  # its :deps. Raises instead of looping forever when some dependency can
  # never be satisfied (a cycle, or a dependency that is not in +batches+).
  def self.dependency_order(batches)
    pending = batches.dup
    sorted = []

    while pending.any?
      ready = pending.select { |batch| ((batch[:deps] || []) - sorted).empty? }
      raise "Circular or unresolvable dependencies between batches" if ready.empty?

      sorted.concat(ready)
      pending -= ready
    end

    sorted
  end
  private_class_method :dependency_order

  # Maps a batch system name to the module implementing it.
  def self.scheduler_for(batch_system)
    case batch_system
    when 'SLURM' then SLURM
    when 'LSF'   then LSF
    when 'PBS'   then PBS
    when nil
      raise "No batch system specified"
    else
      raise "Unknown batch system #{batch_system}"
    end
  end
  private_class_method :scheduler_for
end
@@ -1,2 +1,11 @@
1
- require_relative 'deployment/orchestrator'
1
+ require_relative 'deployment/local'
2
+ require_relative 'deployment/scheduler'
2
3
  require_relative 'deployment/trace'
4
+ require_relative 'deployment/queue'
5
+
6
module Workflow
  class << self
    # Produces +jobs+ with the local executor, using the orchestration rules
    # that Workflow::Orchestrator loads for them. Any further arguments are
    # forwarded untouched to Workflow::LocalExecutor.produce.
    def produce(jobs, ...)
      Workflow::LocalExecutor.produce(jobs, Workflow::Orchestrator.load_rules_for_job(jobs), ...)
    end
  end
end
@@ -71,10 +71,31 @@ module EntityWorkflow
71
71
  end
72
72
 
73
73
  property_name = task_name.to_s.sub(/^(#{entity_name}_list|#{entity_name}|list)_/, '')
74
+ property_job_name = property_name + '_job'
75
+
76
+ property property_job_name => property_type do |*args|
77
+ job(task_name, *args)
78
+ end
79
+
74
80
  property property_name => property_type do |*args|
75
- job = job(task_name, *args)
81
+ job = self.send(property_job_name)
82
+
83
+ job.join if job.running?
84
+
85
+ if job.error?
86
+ if job.recoverable_error?
87
+ job.clean
88
+ else
89
+ raise job.exception
90
+ end
91
+ end
92
+
93
+ job.run unless job.done?
94
+
95
+ job.load
76
96
  Array === job ? job.collect(&:run) : job.run
77
97
  end
98
+
78
99
  end
79
100
 
80
101
  def entity_task(task_name, *args, &block)
@@ -1 +1,3 @@
1
1
  class TaskNotFound < StandardError; end
2
+ class DryRun < Exception; end
3
+
@@ -8,10 +8,13 @@ class Step
8
8
  new_tokens = []
9
9
  if workflow
10
10
  workflow_name = workflow.name
11
- new_tokens << ("workflow:" << workflow_name)
12
- new_tokens << ("task:" << workflow_name << "#" << task_name.to_s)
11
+ new_tokens << ("workflow:" + workflow_name)
12
+ new_tokens << ("task:" + workflow_name << "#" << task_name.to_s)
13
13
  end
14
- new_tokens << ("task:" << task_name.to_s)
14
+ new_tokens << ("task:" + task_name.to_s)
15
+ new_tokens << (task_name.to_s)
16
+ new_tokens << (workflow_name)
17
+ new_tokens << ("task")
15
18
 
16
19
  Scout::Config.get(key, tokens + new_tokens, options)
17
20
  end
@@ -13,6 +13,10 @@ class Step
13
13
  end
14
14
  end
15
15
 
16
+ def files_dir=(dir)
17
+ @files_dir = dir
18
+ end
19
+
16
20
  def file(file = nil)
17
21
  dir = files_dir
18
22
  Path.setup(dir) unless Path === dir
@@ -38,8 +38,8 @@ class Step
38
38
  save_info(@info = info)
39
39
  end
40
40
 
41
- def init_info
42
- log :waiting unless info_file.nil? || Open.exists?(info_file)
41
+ def init_info(status=:waiting)
42
+ log status unless info_file.nil? || Open.exists?(info_file)
43
43
  end
44
44
 
45
45
  def info
@@ -120,7 +120,7 @@ class Step
120
120
  if info.include?(key)
121
121
  case info[key]
122
122
  when Array
123
- info[key].concat Array === value ? value : [value]
123
+ info[key].concat(Array === value ? value : [value])
124
124
  when Hash
125
125
  info[key].merge! value
126
126
  else
@@ -201,7 +201,13 @@ class Step
201
201
  end
202
202
 
203
203
  def exception
204
- info[:exception]
204
+ return nil unless info[:exception]
205
+ begin
206
+ Marshal.load(Base64.decode64(info[:exception]))
207
+ rescue
208
+ Log.exception $!
209
+ nil
210
+ end
205
211
  end
206
212
 
207
213
  # Marshal Step