rbbt-util 5.28.4 → 5.28.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/rbbt/hpc.rb +5 -2
- data/lib/rbbt/persist.rb +1 -0
- data/lib/rbbt/tsv/accessor.rb +10 -2
- data/lib/rbbt/tsv/dumper.rb +10 -2
- data/lib/rbbt/tsv/parallel/traverse.rb +3 -0
- data/lib/rbbt/tsv/util.rb +5 -1
- data/lib/rbbt/util/config.rb +2 -1
- data/lib/rbbt/util/misc/inspect.rb +1 -1
- data/lib/rbbt/util/misc/system.rb +1 -1
- data/lib/rbbt/util/open.rb +1 -1
- data/lib/rbbt/workflow.rb +1 -0
- data/lib/rbbt/workflow/accessor.rb +94 -93
- data/lib/rbbt/workflow/definition.rb +6 -3
- data/lib/rbbt/workflow/integration/cromwell.rb +4 -2
- data/lib/rbbt/workflow/remote_workflow/driver/rest.rb +5 -1
- data/lib/rbbt/workflow/step.rb +17 -5
- data/lib/rbbt/workflow/step/accessor.rb +9 -6
- data/lib/rbbt/workflow/usage.rb +1 -1
- data/lib/rbbt/workflow/util/archive.rb +1 -0
- data/lib/rbbt/workflow/util/orchestrator.rb +215 -0
- data/lib/rbbt/workflow/util/trace.rb +182 -0
- data/share/rbbt_commands/app/start +2 -2
- data/share/rbbt_commands/purge_job +2 -4
- data/share/rbbt_commands/system/status +1 -1
- data/share/rbbt_commands/workflow/forget_deps +1 -3
- data/share/rbbt_commands/workflow/server +2 -0
- data/test/rbbt/tsv/parallel/test_traverse.rb +14 -0
- data/test/rbbt/tsv/test_manipulate.rb +20 -0
- data/test/rbbt/workflow/test_schedule.rb +0 -0
- data/test/rbbt/workflow/util/test_orchestrator.rb +223 -0
- metadata +8 -3
- data/lib/rbbt/workflow/schedule.rb +0 -238
data/lib/rbbt/workflow/step/accessor.rb
CHANGED
@@ -93,8 +93,8 @@ class Step
|
|
93
93
|
else
|
94
94
|
Open.write(path + '.read', value.to_s)
|
95
95
|
end
|
96
|
-
when Step ===
|
97
|
-
|
96
|
+
when Step === value
|
97
|
+
value = value.produce.load
|
98
98
|
else
|
99
99
|
Open.write(path, value.to_s)
|
100
100
|
end
|
@@ -110,12 +110,14 @@ class Step
|
|
110
110
|
task_info = workflow.task_info(task_name)
|
111
111
|
input_types = task_info[:input_types]
|
112
112
|
task_inputs = task_info[:inputs]
|
113
|
+
input_defaults = task_info[:input_defaults]
|
113
114
|
|
114
115
|
inputs = {}
|
115
116
|
job.recursive_inputs.zip(job.recursive_inputs.fields).each do |value,name|
|
116
117
|
next unless task_inputs.include? name.to_sym
|
117
118
|
next if options and ! options.include?(name)
|
118
119
|
next if value.nil?
|
120
|
+
next if input_defaults[name] == value
|
119
121
|
inputs[name] = value
|
120
122
|
end
|
121
123
|
|
@@ -125,7 +127,7 @@ class Step
|
|
125
127
|
end
|
126
128
|
save_inputs(inputs, input_types, dir)
|
127
129
|
|
128
|
-
inputs.
|
130
|
+
inputs.keys
|
129
131
|
end
|
130
132
|
|
131
133
|
def name
|
@@ -437,11 +439,12 @@ class Step
|
|
437
439
|
rec_dependencies = self.rec_dependencies
|
438
440
|
return [] if rec_dependencies.empty?
|
439
441
|
canfail_paths = self.canfail_paths
|
442
|
+
|
440
443
|
dirty_files = rec_dependencies.reject{|dep|
|
441
444
|
(defined?(WorkflowRemoteClient) && WorkflowRemoteClient::RemoteStep === dep) ||
|
442
445
|
! Open.exists?(dep.info_file) ||
|
443
446
|
(dep.path && (Open.exists?(dep.path) || Open.remote?(dep.path))) ||
|
444
|
-
((dep.error? || dep.aborted?
|
447
|
+
((dep.error? || dep.aborted?) && (! dep.recoverable_error? || canfail_paths.include?(dep.path)))
|
445
448
|
}
|
446
449
|
end
|
447
450
|
|
@@ -508,12 +511,12 @@ class Step
|
|
508
511
|
|
509
512
|
def nopid?
|
510
513
|
pid = info[:pid] || Open.exists?(pid_file)
|
511
|
-
! pid && ! (status.nil? || status == :aborted || status == :done || status == :error)
|
514
|
+
! pid && ! (status.nil? || status == :aborted || status == :done || status == :error || status == :cleaned)
|
512
515
|
end
|
513
516
|
|
514
517
|
def aborted?
|
515
518
|
status = self.status
|
516
|
-
status == :aborted || ((status != :noinfo && status != :setup && status != :noinfo) && nopid?)
|
519
|
+
status == :aborted || ((status != :cleaned && status != :noinfo && status != :setup && status != :noinfo) && nopid?)
|
517
520
|
end
|
518
521
|
|
519
522
|
# {{{ INFO
|
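data/lib/rbbt/workflow/usage.rb
CHANGED
(the +1 -1 diff for this file is not included in this extract)
data/lib/rbbt/workflow/util/orchestrator.rb
ADDED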
@@ -0,0 +1,215 @@
|
|
1
|
+
require 'rbbt/workflow'
|
2
|
+
|
3
|
+
module Workflow
  # Drives a set of jobs to completion: it works out which unfinished jobs
  # each job waits on, starts the jobs whose dependencies are finished, and
  # holds a job back while the resources declared for it in the rules do not
  # fit into +available_resources+.
  #
  # The +rules+ Hash is read with these keys (all optional):
  #   rules["defaults"]           - rules applied to every job
  #   rules["default_resources"]  - resources applied to every job
  #   rules[workflow][task_name]  - rules for one task; may hold "resources",
  #                                 "config_keys", "log" and "erase"
  class Orchestrator

    # Returns a Hash mapping +job+, and every unfinished job below it, to the
    # list of unfinished jobs (direct and transitive) it waits on. Both
    # +dependencies+ and +input_dependencies+ are followed. A finished +job+
    # yields <tt>{job => []}</tt>.
    def self.job_workload(job)
      workload = {job => []}
      return workload if job.done?

      [job.dependencies, job.input_dependencies].each do |deps|
        deps.each do |dep|
          next if dep.done?
          workload.merge!(job_workload(dep))
          workload[job] += workload[dep]
          workload[job] << dep
          workload[job].uniq!
        end
      end

      workload
    end

    # Returns the rules that apply to +job+: the entry under
    # rules[workflow][task_name] with every key it leaves unset filled in from
    # rules["defaults"]. Always returns a Hash (empty when nothing applies),
    # so callers can index the result directly. The Hash is a copy; +rules+
    # itself is not modified.
    def self.job_rules(rules, job)
      workflow = job.workflow.to_s
      task_name = job.task_name.to_s

      defaults = rules["defaults"] || {}
      task_rules = (rules[workflow] || {})[task_name] || {}

      job_rules = IndiferentHash.setup(task_rules.dup)
      defaults.each{|k,v| job_rules[k] = v if job_rules[k].nil? }
      job_rules
    end

    # Drops every job whose path was already seen earlier in +candidates+,
    # keeping the first occurrence and the original order.
    def self.purge_duplicates(candidates)
      candidates.uniq{|job| job.path }
    end

    # Returns the resources Hash for +job+: the "resources" entry of its rules
    # completed with rules["default_resources"] or, when that is missing, with
    # the "resources" entry of rules["defaults"]. Works when none of these
    # keys exist, in which case the result is empty.
    def self.job_resources(rules, job)
      resources = IndiferentHash.setup((job_rules(rules, job)["resources"] || {}).dup)

      default_resources = rules["default_resources"] || (rules["defaults"] || {})["resources"]
      default_resources.each{|k,v| resources[k] ||= v } if default_resources

      resources
    end

    # Orders +candidates+ so that jobs whose resource values add up to the
    # largest total come first. Values are coerced with +to_f+ so numbers
    # given as strings can be added; values without +to_f+ count as 0.
    def self.sort_candidates(candidates, rules)
      candidates.sort_by do |job|
        total = job_resources(rules, job).values.inject(0) do |acc,value|
          acc + (value.respond_to?(:to_f) ? value.to_f : 0)
        end
        -total
      end
    end

    # Returns the jobs in +workload+ that wait on nothing and are not done,
    # without duplicated paths and sorted with +sort_candidates+.
    def self.candidates(workload, rules)
      ready = workload.select{|job,deps| deps.empty? }.
        collect{|job,deps| job }.
        reject{|job| job.done? }

      sort_candidates(purge_duplicates(ready), rules)
    end

    attr_accessor :available_resources, :resources_requested, :resources_used, :timer

    # timer               - seconds slept between scheduling rounds
    # available_resources - Hash of resource name to the maximum that may be
    #                       reserved at once; resources not listed are not
    #                       limited
    def initialize(timer = 5, available_resources = {})
      @timer = timer
      @available_resources = IndiferentHash.setup(available_resources)
      @resources_requested = IndiferentHash.setup({})
      @resources_used = IndiferentHash.setup({})
    end

    # Gives back the resources reserved for +job+ by +check_resources+ and
    # forgets the reservation. Does nothing for a job without reservation.
    def release_resources(job)
      if resources_used[job]
        Log.debug "Orchestrator releasing resouces from #{job.path}"
        resources_used[job].each do |resource,value|
          # NOTE(review): 'size' is skipped here but is still added by
          # check_resources, so its counter only grows; confirm whether
          # 'size' is meant to be accounted at all.
          next if resource == 'size'
          resources_requested[resource] = (resources_requested[resource] || 0) - value.to_i
        end
        resources_used.delete job
      end
    end

    # Reserves the resources of +job+ and yields when they fit into
    # +available_resources+, returning the value of the block. When some
    # limited resource would be exceeded nothing is reserved, the block is not
    # called and nil is returned. Amounts are coerced with +to_i+ both when
    # checking and when reserving.
    def check_resources(rules, job)
      resources = Orchestrator.job_resources(rules, job)

      limit_resources = resources.select do |resource,value|
        available = available_resources[resource]
        available && ((resources_requested[resource] || 0) + value.to_i) > available
      end.collect{|resource,value| resource }

      if limit_resources.any?
        Log.debug "Orchestrator waiting on #{job.path} due to #{limit_resources * ", "}"
        nil
      else
        resources_used[job] = resources
        resources.each do |resource,value|
          resources_requested[resource] ||= 0
          resources_requested[resource] += value.to_i
        end
        Log.low "Orchestrator producing #{job.path} with resources #{resources}"

        return yield
      end
    end

    # Produces +job+ inside Rbbt::Config.with_config, after processing each of
    # the :config_keys of its rules, and under the log severity given by the
    # :log rule (the current severity when there is none).
    def run_with_rules(rules, job)
      job_rules = Orchestrator.job_rules(rules, job)

      Rbbt::Config.with_config do
        job_rules[:config_keys].each do |config|
          Rbbt::Config.process_config config
        end if job_rules && job_rules[:config_keys]

        log = job_rules[:log] if job_rules
        log = Log.severity if log.nil?
        Log.with_severity log do
          # Arguments kept as in the original call; their meaning is defined
          # by Step#produce, which is not visible here.
          job.produce(false, true)
        end
      end
    end

    # Cleans the dependencies of a finished +job+ whose rules set "erase" to
    # true, unless the dependency is one of the top-level jobs or is still a
    # dependency of another job in +workload+. Before cleaning, the job
    # archives its dependencies and copies its files dir, and the dependency
    # is removed from job.dependencies.
    def erase_job_dependencies(job, rules, workload, top_level_jobs)
      job.dependencies.each do |dep|
        next if top_level_jobs.include? dep.path
        next unless Orchestrator.job_rules(rules, dep)["erase"].to_s == 'true'

        list = (workload.keys - [job]).collect{|pending| pending.dependencies}.flatten
        next if list.include?(dep)

        Log.high "Erasing #{dep.path} from #{job.path}"
        job.archive_deps
        job.copy_files_dir
        job.dependencies = job.dependencies - [dep]
        dep.clean
      end
    end

    # Runs +jobs+ and their unfinished dependencies until the workload is
    # empty. Each round looks at the jobs holding resources plus the jobs that
    # are ready to start: failed jobs release their resources (recoverable
    # ones are cleaned and the whole workload is rebuilt), finished jobs
    # release their resources and may erase dependencies, running jobs are
    # left alone, and the rest are started if their resources fit. Raises a
    # RuntimeError when there is pending work but nothing to look at.
    def process(rules, jobs)
      begin

        workload = jobs.inject({}){|acc,job| acc.merge!(Orchestrator.job_workload(job)) }

        top_level_jobs = jobs.collect{|job| job.path }
        while workload.any?

          # A job that holds resources can also still be listed as a
          # candidate; uniq makes sure it is handled once per round.
          candidates = (resources_used.keys + Orchestrator.candidates(workload, rules)).uniq
          raise "No candidates and no running jobs" if candidates.empty?

          candidates.each do |job|
            case
            when (job.error? || job.aborted?)
              begin
                if job.recoverable_error?
                  job.clean
                  raise TryAgain
                else
                  next
                end
              ensure
                # Runs on both paths above: before the retry and before
                # moving on to the next candidate.
                Log.warn "Releases resources from failed job: #{job.path}"
                release_resources(job)
              end
            when job.done?
              Log.debug "Orchestrator done #{job.path}"
              release_resources(job)
              erase_job_dependencies(job, rules, workload, top_level_jobs)

            when job.running?
              next

            else
              check_resources(rules, job) do
                run_with_rules(rules, job)
              end
            end
          end

          # Drop finished jobs, and stop waiting on dependencies that are
          # finished or failed without being recoverable.
          new_workload = {}
          workload.each do |k,v|
            next if k.done?
            new_workload[k] = v.reject{|d| d.done? || (d.error? && ! d.recoverable_error?)}
          end
          workload = new_workload

          # Nothing left to wait for once the workload is empty.
          sleep timer if workload.any?
        end
      rescue TryAgain
        retry
      end
    end
  end
end
|
data/lib/rbbt/workflow/util/trace.rb
ADDED
@@ -0,0 +1,182 @@
|
|
1
|
+
require 'rbbt/util/R'
|
2
|
+
|
3
|
+
module Workflow
  # Builds timing tables for the finished jobs among +seed_jobs+ and their
  # recursive dependencies (a job counts as finished when its info has :done).
  #
  # options:
  #   :fix_gap   - when set, the periods in which no job was running are
  #                subtracted from the "Start.second" and "End.second" columns
  #   :plot      - when set, a timeline is drawn through R and handed to
  #                rbbt.png_plot with this value as first argument
  #   :width, :height - interpolated into the rbbt.png_plot call
  #   :plot_data - when set, the per-job table is returned instead of the
  #                per-task statistics
  #
  # Returns the per-task TSV (Calls, Avg. Time, Total Time, Cpus, Spark,
  # Shard) or, with :plot_data, the per-job TSV (Workflow, Task, Start, End,
  # Start.second, End.second). Raises a RuntimeError when no job is finished.
  def self.trace(seed_jobs, options = {})

    jobs = []
    seed_jobs.each{|j| jobs << j; jobs += j.rec_dependencies}
    # A job reachable from several seeds must be counted once in the stats.
    jobs = jobs.uniq{|j| j.path }

    data = TSV.setup({}, "Job~Workflow,Task,Start,End#:type=:list")
    min_start = nil
    jobs.each do |job|
      job_info = job.info
      next unless job_info[:done]
      started = job_info[:started]
      ddone = job_info[:done]

      code = [job.workflow, job.task_name].compact.collect{|s| s.to_s} * "."
      code = code + '.' + job.name

      data[code] = [job.workflow.to_s, job.task_name, started, ddone]
      min_start = started if min_start.nil? || started < min_start
    end

    raise "No jobs to process" if data.size == 0

    data.add_field "Start.second" do |k,value|
      value["Start"] - min_start
    end

    data.add_field "End.second" do |k,value|
      value["End"] - min_start
    end

    if options[:fix_gap]
      ranges = []
      data.through do |k,values|
        start, eend = values.values_at "Start.second", "End.second"

        ranges << (start..eend)
      end

      # gaps maps the end of each busy period to the idle time that follows
      gaps = {}
      last = nil
      Misc.collapse_ranges(ranges).each do |range|
        start = range.begin
        eend = range.end
        if last
          gaps[last] = start - last
        end
        last = eend
      end

      # "End.second" is shifted first: both shifts are computed from the
      # "Start.second" value of the row, so it must be adjusted last.
      data.process "End.second" do |value,k,values|
        gap = Misc.sum(gaps.select{|pos,size| pos < values["Start.second"]}.collect{|pos,size| size})
        value - gap
      end

      data.process "Start.second" do |value,k,values|
        gap = Misc.sum(gaps.select{|pos,size| pos < values["Start.second"]}.collect{|pos,size| size})
        value - gap
      end
    end

    tasks_info = {}

    jobs.each do |dep|
      dep_info = dep.info
      next unless dep_info[:done]
      task = [dep.workflow, dep.task_name].compact.collect{|s| s.to_s} * "#"
      info = tasks_info[task] ||= {}

      time = dep_info[:done] - dep_info[:started]
      info[:time] ||= []
      info[:time] << time

      cpus = nil
      spark = false
      shard = false
      # Jobs may have no :config_keys recorded
      (dep_info[:config_keys] || []).each do |kinfo|
        key, value, tokens = kinfo
        key = key.to_s
        cpus = value if key.include? 'cpu'
        spark = value if key == 'spark'
        shard = value if key == 'shard'
      end

      # The values of the last job seen for a task are the ones kept
      info[:cpus] = cpus || 1
      info[:spark] = spark
      info[:shard] = shard
    end

    stats = TSV.setup({}, "Task~Calls,Avg. Time,Total Time,Cpus,Spark,Shard#:type=:list")

    tasks_info.each do |task, info|
      time_lists, cpus, spark, shard = info.values_at :time, :cpus, :spark, :shard
      avg_time = Misc.mean(time_lists)
      total_time = Misc.sum(time_lists)
      calls = time_lists.length
      stats[task] = [calls, avg_time, total_time, cpus, spark, shard]
    end

    start = data.column("Start.second").values.flatten.collect{|v| v.to_f}.min
    eend = data.column("End.second").values.flatten.collect{|v| v.to_f}.max
    total = eend - start
    Log.info "Total time elapsed: #{total} seconds"

    if options[:fix_gap]
      total_gaps = Misc.sum(gaps.collect{|k,v| v})
      Log.info "Total gaps: #{total_gaps} seconds"
    end

    plot, width, height = options.values_at :plot, :width, :height
    if plot
      data.R <<-EOF, [:svg]
rbbt.require('tidyverse')
rbbt.require('ggplot2')

names(data) <- make.names(names(data))
data$id = rownames(data)
data$content = data$Task
data$start = data$Start
data$end = data$End
data$Project = data$Workflow

tasks = data

#theme_gantt <- function(base_size=11, base_family="Source Sans Pro Light") {
theme_gantt <- function(base_size=11, base_family="Sans Serif") {
  ret <- theme_bw(base_size, base_family) %+replace%
    theme(panel.background = element_rect(fill="#ffffff", colour=NA),
          axis.title.x=element_text(vjust=-0.2), axis.title.y=element_text(vjust=1.5),
          title=element_text(vjust=1.2, family="Source Sans Pro Semibold"),
          panel.border = element_blank(), axis.line=element_blank(),
          panel.grid.minor=element_blank(),
          panel.grid.major.y = element_blank(),
          panel.grid.major.x = element_line(size=0.5, colour="grey80"),
          axis.ticks=element_blank(),
          legend.position="bottom",
          axis.title=element_text(size=rel(1.2), family="Source Sans Pro Semibold"),
          strip.text=element_text(size=rel(1.5), family="Source Sans Pro Semibold"),
          strip.background=element_rect(fill="#ffffff", colour=NA),
          panel.spacing.y=unit(1.5, "lines"),
          legend.key = element_blank())

  ret
}

tasks.long <- tasks %>%
  gather(date.type, task.date, -c(Project, Task, id, Start.second, End.second)) %>%
  arrange(date.type, task.date) %>%
  mutate(id = factor(id, levels=rev(unique(id)), ordered=TRUE))

x.breaks <- seq(length(tasks$Task) + 0.5 - 3, 0, by=-3)

timeline <- ggplot(tasks.long, aes(y=id, yend=id, x=Start.second, xend=End.second, colour=Task)) +
  geom_segment() +
  geom_vline(xintercept=x.breaks, colour="grey80", linetype="dotted") +
  guides(colour=guide_legend(title=NULL)) +
  labs(x=NULL, y=NULL) +
  theme_gantt() + theme(axis.text.x=element_text(angle=45, hjust=1))

rbbt.png_plot('#{plot}', 'plot(timeline)', width=#{width}, height=#{height}, pointsize=6)
      EOF
    end

    if options[:plot_data]
      data
    else
      stats
    end

  end
end
|