rbbt-util 5.28.2 → 5.28.8

@@ -81,9 +81,10 @@ module Workflow
  if forget
  remove = config :remove_dep_tasks, :remove_dep_tasks, :default => REMOVE_DEP_TASKS
  self.archive_deps
+ self.copy_files_dir
  self.dependencies = self.dependencies - [dep]
  Open.rm_rf self.files_dir if Open.exist? self.files_dir
- FileUtils.cp_r dep.files_dir, self.files_dir if Open.exist? dep.files_dir
+ FileUtils.cp_r dep.files_dir, self.files_dir if Open.exist?(dep.files_dir)
  Open.ln_h dep.path, self.tmp_path
  case remove.to_s
  when 'true'
@@ -92,8 +93,10 @@ module Workflow
  dep.recursive_clean
  end
  else
- Open.rm_rf self.files_dir
- Open.link dep.files_dir, self.files_dir
+ if Open.exists?(dep.files_dir)
+ Open.rm_rf self.files_dir
+ Open.link dep.files_dir, self.files_dir
+ end
  Open.link dep.path, self.path
  end
  nil
@@ -1,13 +1,20 @@
- module Workflow
+ module Cromwell

  Rbbt.claim Rbbt.software.opt.jar["cromwell.jar"], :url, "https://github.com/broadinstitute/cromwell/releases/download/48/cromwell-48.jar"
  Rbbt.claim Rbbt.software.opt.jar["wdltool.jar"], :url, "https://github.com/broadinstitute/wdltool/releases/download/0.14/wdltool-0.14.jar"

- def run_cromwell(file, work_dir, options = {})
+ def self.run_cromwell(file, work_dir, options = {})
+ cromwell_inputs_file = Misc.process_options options, :cromwell_inputs_file
  jar = Rbbt.software.opt.jar["cromwell.jar"].produce.find
- CMD.cmd_log("java -jar '#{jar}' run '#{file}' --workflow-root='#{work_dir}'", options.merge("add_option_dashes" => true))
+ if cromwell_inputs_file
+ CMD.cmd_log("java -jar '#{jar}' run '#{file}' --workflow-root='#{work_dir}' -i #{cromwell_inputs_file}", options.merge("add_option_dashes" => true))
+ else
+ CMD.cmd_log("java -jar '#{jar}' run '#{file}' --workflow-root='#{work_dir}'", options.merge("add_option_dashes" => true))
+ end
  end
+ end

+ module Workflow
  def load_cromwell(file)
  jar = Rbbt.software.opt.jar["wdltool.jar"].produce.find
  inputs = JSON.load(CMD.cmd("java -jar '#{jar}' inputs '#{file}'"))
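
For reference, a minimal sketch of calling the reworked run_cromwell; the require path, WDL file and inputs JSON below are illustrative assumptions, not part of the diff:

    require 'rbbt/workflow/util/cromwell'  # assumed location of the file patched above

    # Produces cromwell.jar via Rbbt.claim on first use, then shells out through CMD.cmd_log;
    # the -i flag is only added when :cromwell_inputs_file is given.
    Cromwell.run_cromwell('pipeline.wdl', '/tmp/cromwell_work',
                          :cromwell_inputs_file => 'pipeline_inputs.json')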
@@ -85,7 +85,11 @@ class RemoteWorkflow

  RemoteWorkflow::REST.__prepare_inputs_for_restclient(params)
  name = RemoteWorkflow.capture_exception do
- RestClient.post(self.encode(url), params)
+ begin
+ RestClient.post(self.encode(url), params)
+ rescue RestClient::MovedPermanently, RestClient::Found, RestClient::TemporaryRedirect
+ raise RbbtException, "REST end-point moved to: #{$!.response.headers[:location]}"
+ end
  end

  Log.debug{ "RestClient jobname returned for #{ url } - #{Misc.fingerprint params}: #{name}" }
@@ -84,6 +84,7 @@ class Step
  end

  def load_dependencies_from_info
+ relocated = nil
  @dependencies = (self.info[:dependencies] || []).collect do |task,name,dep_path|
  if Open.exists?(dep_path) || Open.exists?(dep_path + '.info')
  Workflow._load_step dep_path
@@ -108,6 +109,14 @@ class Step
  @inputs || []
  end

+ def copy_files_dir
+ if File.symlink?(self.files_dir)
+ realpath = Open.realpath(self.files_dir)
+ Open.rm self.files_dir
+ Open.cp realpath, self.files_dir
+ end
+ end
+
  def archive_deps
  self.set_info :archived_info, archived_info
  self.set_info :archived_dependencies, info[:dependencies]
@@ -411,7 +420,7 @@ class Step
  return
  end

- if (Open.exists?(path) or Open.broken_link?(path)) or Open.exists?(pid_file) or Open.exists?(info_file) or Open.exists?(files_dir)
+ if (Open.exists?(path) or Open.broken_link?(path)) or Open.exists?(pid_file) or Open.exists?(info_file) or Open.exists?(files_dir) or Open.broken_link?(files_dir)

  @result = nil
  @pid = nil
@@ -419,8 +428,8 @@ class Step
  Misc.insist do
  Open.rm info_file if Open.exists?(info_file)
  Open.rm md5_file if Open.exists?(md5_file)
- Open.rm path if (Open.exists?(path) or Open.broken_link?(path))
- Open.rm_rf files_dir if Open.exists?(files_dir)
+ Open.rm path if (Open.exists?(path) || Open.broken_link?(path))
+ Open.rm_rf files_dir if Open.exists?(files_dir) || Open.broken_link?(files_dir)
  Open.rm pid_file if Open.exists?(pid_file)
  Open.rm tmp_path if Open.exists?(tmp_path)
  end
@@ -454,12 +463,15 @@ class Step
  return [] if dependencies.nil? or dependencies.empty?

  new_dependencies = []
+ archived_deps = self.info[:archived_info] ? self.info[:archived_info].keys : []
+
  dependencies.each{|step|
  #next if self.done? && Open.exists?(info_file) && info[:dependencies] && info[:dependencies].select{|task,name,path| path == step.path }.empty?
- next if seen.include? step
+ next if archived_deps.include? step.path
+ next if seen.include? step.path
  next if self.done? && need_run && ! updatable?

- r = step.rec_dependencies(need_run, new_dependencies)
+ r = step.rec_dependencies(need_run, new_dependencies.collect{|d| d.path})
  new_dependencies.concat r
  new_dependencies << step
  }
@@ -93,8 +93,8 @@ class Step
  else
  Open.write(path + '.read', value.to_s)
  end
- when Step === v
- v = v.produce.load
+ when Step === value
+ value = value.produce.load
  else
  Open.write(path, value.to_s)
  end
@@ -110,12 +110,14 @@ class Step
  task_info = workflow.task_info(task_name)
  input_types = task_info[:input_types]
  task_inputs = task_info[:inputs]
+ input_defaults = task_info[:input_defaults]

  inputs = {}
  job.recursive_inputs.zip(job.recursive_inputs.fields).each do |value,name|
  next unless task_inputs.include? name.to_sym
  next if options and ! options.include?(name)
  next if value.nil?
+ next if input_defaults[name] == value
  inputs[name] = value
  end

@@ -125,7 +127,7 @@ class Step
  end
  save_inputs(inputs, input_types, dir)

- inputs.any?
+ inputs.keys
  end

  def name
@@ -437,11 +439,12 @@ class Step
  rec_dependencies = self.rec_dependencies
  return [] if rec_dependencies.empty?
  canfail_paths = self.canfail_paths
+
  dirty_files = rec_dependencies.reject{|dep|
  (defined?(WorkflowRemoteClient) && WorkflowRemoteClient::RemoteStep === dep) ||
  ! Open.exists?(dep.info_file) ||
  (dep.path && (Open.exists?(dep.path) || Open.remote?(dep.path))) ||
- ((dep.error? || dep.aborted? || dep.waiting?) && (! dep.recoverable_error? || canfail_paths.include?(dep.path)))
+ ((dep.error? || dep.aborted?) && (! dep.recoverable_error? || canfail_paths.include?(dep.path)))
  }
  end

@@ -508,12 +511,12 @@ class Step

  def nopid?
  pid = info[:pid] || Open.exists?(pid_file)
- ! pid && ! (status.nil? || status == :aborted || status == :done || status == :error)
+ ! pid && ! (status.nil? || status == :aborted || status == :done || status == :error || status == :cleaned)
  end

  def aborted?
  status = self.status
- status == :aborted || ((status != :noinfo && status != :setup && status != :noinfo) && nopid?)
+ status == :aborted || ((status != :cleaned && status != :noinfo && status != :setup && status != :noinfo) && nopid?)
  end

  # {{{ INFO
@@ -240,7 +240,7 @@ module Workflow

  inputs.each do |input, type, file|
  case type
- when :tsv, :array, :text
+ when :tsv, :array, :text, :file
  lines = file.read.split("\n")
  head = lines[0..5].compact * "\n\n"
  head = head[0..500]
@@ -262,6 +262,7 @@ puts resource[path].find(search_path)

  job_files.each do |file|
  begin
+ Log.debug "Purging #{file}"
  Open.rm_rf file if Open.exists?(file)
  rescue
  Log.warn "Could not erase '#{file}': #{$!.message}"
@@ -0,0 +1,190 @@
+ require 'rbbt/workflow'
+
+ module Workflow
+   class Orchestrator
+
+     def self.job_workload(job)
+       workload = {job => []}
+       return workload if job.done?
+
+       job.dependencies.each do |dep|
+         next if dep.done?
+         workload.merge!(job_workload(dep))
+         workload[job] += workload[dep]
+         workload[job] << dep
+       end
+
+       job.input_dependencies.each do |dep|
+         next if dep.done?
+         workload.merge!(job_workload(dep))
+         workload[job] += workload[dep]
+         workload[job] << dep
+       end
+
+       workload
+     end
+
+     def self.job_rules(rules, job)
+       workflow = job.workflow.to_s
+       task_name = job.task_name.to_s
+
+       return IndiferentHash.setup(rules["defaults"]) unless rules[workflow]
+       return IndiferentHash.setup(rules["defaults"]) unless rules[workflow][task_name]
+
+       job_rules = IndiferentHash.setup(rules[workflow][task_name])
+       rules["defaults"].each{|k,v| job_rules[k] ||= v } if rules["defaults"]
+       job_rules
+     end
+
+     def self.purge_duplicates(candidates)
+       seen = Set.new
+       candidates.select do |job|
+         if seen.include? job.path
+           false
+         else
+           seen << job.path
+           true
+         end
+       end
+     end
+
+     def self.job_resources(rules, job)
+       resources = (job_rules(rules, job) || {})["resources"] || {}
+
+       IndiferentHash.setup(resources)
+
+       default_resources = rules["default_resources"] || rules["defaults"]["resources"]
+       default_resources.each{|k,v| resources[k] ||= v } if default_resources
+
+       resources
+     end
+
+     def self.sort_candidates(candidates, rules)
+       seen = Set.new
+       candidates.sort_by do |job|
+         - job_resources(rules, job).values.inject(0){|acc,e| acc += e}
+       end
+     end
+
+     def self.candidates(workload, rules)
+       if rules.empty?
+         candidates = workload.select{|k,v| v.empty? }.
+           collect{|k,v| k}.
+           reject{|k| k.done? }
+       else
+         candidates = workload. #select{|k,v| Orchestrator.job_rules(rules, k) }.
+           select{|k,v| v.empty? }.
+           collect{|k,v| k }.
+           reject{|k| k.done? }
+       end
+
+       top_level = workload.keys - workload.values.flatten
+
+       candidates = purge_duplicates candidates
+       candidates = sort_candidates candidates, rules
+
+       candidates
+     end
+
+     attr_accessor :available_resources, :resources_requested, :resources_used, :timer
+
+     def initialize(timer = 5, available_resources = {})
+       @timer = timer
+       @available_resources = IndiferentHash.setup(available_resources)
+       @resources_requested = IndiferentHash.setup({})
+       @resources_used = IndiferentHash.setup({})
+     end
+
+     def release_resources(job)
+       if resources_used[job]
+         resources_used[job].each do |resource,value|
+           next if resource == 'size'
+           resources_requested[resource] -= value.to_i
+         end
+         resources_used.delete job
+       end
+     end
+
+     def check_resources(rules, job)
+       resources = Orchestrator.job_resources(rules, job)
+
+       limit_resources = resources.select{|resource,value| available_resources[resource] && ((resources_requested[resource] || 0) + value) > available_resources[resource] }.collect{|resource,v| resource }
+       if limit_resources.any?
+         Log.debug "Orchestrator waiting on #{job.path} due to #{limit_resources * ", "}"
+       else
+
+         resources_used[job] = resources
+         resources.each do |resource,value|
+           resources_requested[resource] ||= 0
+           resources_requested[resource] += value.to_i
+         end
+         Log.low "Orchestrator producing #{job.path} with resources #{resources}"
+
+         return yield
+       end
+     end
+
+     def run_with_rules(rules, job)
+       job_rules = Orchestrator.job_rules(rules, job)
+
+       Rbbt::Config.with_config do
+         job_rules[:config_keys].each do |config|
+           Rbbt::Config.process_config config
+         end if job_rules && job_rules[:config_keys]
+
+         log = job_rules[:log] if job_rules
+         log = Log.severity if log.nil?
+         Log.with_severity log do
+           job.produce(false, true)
+         end
+       end
+     end
+
+     def process(rules, jobs)
+       begin
+
+         workload = jobs.inject({}){|acc,job| acc.merge!(Orchestrator.job_workload(job)) }
+
+         while workload.values.flatten.any?
+
+           candidates = resources_used.keys + Orchestrator.candidates(workload, rules)
+           raise "No candidates" if candidates.empty?
+
+           candidates.each do |job|
+             case
+             when (job.error? || job.aborted?)
+               if job.recoverable_error?
+                 job.clean
+                 raise TryAgain
+               else
+                 next
+               end
+               release_resources(job)
+             when job.done?
+               Log.debug "Orchestrator done #{job.path}"
+               release_resources(job)
+               raise TryAgain
+
+             when job.running?
+               next
+
+             else
+               check_resources(rules, job) do
+                 run_with_rules(rules, job)
+               end
+             end
+           end
+
+           new_workload = {}
+           workload.each do |k,v|
+             next if k.done?
+             new_workload[k] = v.reject{|d| d.done? || (d.error? && ! d.recoverable_error?)}
+           end
+           sleep timer
+         end
+       rescue TryAgain
+         retry
+       end
+     end
+   end
+ end
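
A minimal sketch of driving the new Orchestrator; the workflow, task and resource figures are illustrative assumptions, but the rules layout ("defaults" plus per-workflow, per-task entries with a "resources" hash) follows job_rules and job_resources above:

    require 'rbbt/workflow'
    Workflow.require_workflow "Sample"   # hypothetical workflow with a :big_task task

    rules = {
      "defaults" => { "resources" => { "cpus" => 1 } },
      "Sample"   => { "big_task" => { "resources" => { "cpus" => 8 } } }
    }

    jobs = [Sample.job(:big_task, "run1")]
    # poll every 5 seconds, with 16 CPUs available to hand out
    orchestrator = Workflow::Orchestrator.new(5, "cpus" => 16)
    orchestrator.process(rules, jobs)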
@@ -0,0 +1,182 @@
+ require 'rbbt/util/R'
+
+ module Workflow
+   def self.trace(seed_jobs, options = {})
+
+     jobs = []
+     seed_jobs.each{|j| jobs << j; jobs += j.rec_dependencies}
+
+     data = TSV.setup({}, "Job~Workflow,Task,Start,End#:type=:list")
+     min_start = nil
+     max_done = nil
+     jobs.each do |job|
+       next unless job.info[:done]
+       started = job.info[:started]
+       ddone = job.info[:done]
+
+       code = [job.workflow, job.task_name].compact.collect{|s| s.to_s} * "."
+       code = code + '.' + job.name
+
+       data[code] = [job.workflow.to_s, job.task_name, started, ddone]
+       if min_start.nil?
+         min_start = started
+       else
+         min_start = started if started < min_start
+       end
+
+       if max_done.nil?
+         max_done = ddone
+       else
+         max_done = ddone if ddone > max_done
+       end
+     end
+
+     data.add_field "Start.second" do |k,value|
+       value["Start"] - min_start
+     end
+
+     data.add_field "End.second" do |k,value|
+       value["End"] - min_start
+     end
+
+     if options[:fix_gap]
+       ranges = []
+       data.through do |k,values|
+         start, eend = values.values_at "Start.second", "End.second"
+
+         ranges << (start..eend)
+       end
+
+       gaps = {}
+       last = nil
+       Misc.collapse_ranges(ranges).each do |range|
+         start = range.begin
+         eend = range.end
+         if last
+           gaps[last] = start - last
+         end
+         last = eend
+       end
+
+       data.process "End.second" do |value,k,values|
+         gap = Misc.sum(gaps.select{|pos,size| pos < values["Start.second"]}.collect{|pos,size| size})
+         value - gap
+       end
+
+       data.process "Start.second" do |value,k,values|
+         gap = Misc.sum(gaps.select{|pos,size| pos < values["Start.second"]}.collect{|pos,size| size})
+         value - gap
+       end
+     end
+
+     tasks_info = {}
+
+     jobs.each do |dep|
+       next unless dep.info[:done]
+       task = [dep.workflow, dep.task_name].compact.collect{|s| s.to_s} * "#"
+       info = tasks_info[task] ||= {}
+
+       time = dep.info[:done] - dep.info[:started]
+       info[:time] ||= []
+       info[:time] << time
+
+       cpus = nil
+       spark = false
+       shard = false
+       dep.info[:config_keys].select do |kinfo|
+         key, value, tokens = kinfo
+         key = key.to_s
+         cpus = value if key.include? 'cpu'
+         spark = value if key == 'spark'
+         shard = value if key == 'shard'
+       end
+
+       info[:cpus] = cpus || 1
+       info[:spark] = spark
+       info[:shard] = shard
+     end
+
+     stats = TSV.setup({}, "Task~Calls,Avg. Time,Total Time,Cpus,Spark,Shard#:type=:list")
+
+     tasks_info.each do |task, info|
+       time_lists, cpus, spark, shard = info.values_at :time, :cpus, :spark, :shard
+       avg_time = Misc.mean(time_lists)
+       total_time = Misc.sum(time_lists)
+       calls = time_lists.length
+       stats[task] = [calls, avg_time, total_time, cpus, spark, shard]
+     end
+
+     raise "No jobs to process" if data.size == 0
+
+     start = data.column("Start.second").values.flatten.collect{|v| v.to_f}.min
+     eend = data.column("End.second").values.flatten.collect{|v| v.to_f}.max
+     total = eend - start
+     Log.info "Total time elapsed: #{total} seconds"
+
+     if options[:fix_gap]
+       total_gaps = Misc.sum(gaps.collect{|k,v| v})
+       Log.info "Total gaps: #{total_gaps} seconds"
+     end
+
+     plot, width, height = options.values_at :plot, :width, :height
+     if plot
+       data.R <<-EOF, [:svg]
+         rbbt.require('tidyverse')
+         rbbt.require('ggplot2')
+
+         names(data) <- make.names(names(data))
+         data$id = rownames(data)
+         data$content = data$Task
+         data$start = data$Start
+         data$end = data$End
+         data$Project = data$Workflow
+
+         tasks = data
+
+         #theme_gantt <- function(base_size=11, base_family="Source Sans Pro Light") {
+         theme_gantt <- function(base_size=11, base_family="Sans Serif") {
+           ret <- theme_bw(base_size, base_family) %+replace%
+             theme(panel.background = element_rect(fill="#ffffff", colour=NA),
+               axis.title.x=element_text(vjust=-0.2), axis.title.y=element_text(vjust=1.5),
+               title=element_text(vjust=1.2, family="Source Sans Pro Semibold"),
+               panel.border = element_blank(), axis.line=element_blank(),
+               panel.grid.minor=element_blank(),
+               panel.grid.major.y = element_blank(),
+               panel.grid.major.x = element_line(size=0.5, colour="grey80"),
+               axis.ticks=element_blank(),
+               legend.position="bottom",
+               axis.title=element_text(size=rel(1.2), family="Source Sans Pro Semibold"),
+               strip.text=element_text(size=rel(1.5), family="Source Sans Pro Semibold"),
+               strip.background=element_rect(fill="#ffffff", colour=NA),
+               panel.spacing.y=unit(1.5, "lines"),
+               legend.key = element_blank())
+
+           ret
+         }
+
+         tasks.long <- tasks %>%
+           gather(date.type, task.date, -c(Project, Task, id, Start.second, End.second)) %>%
+           arrange(date.type, task.date) %>%
+           mutate(id = factor(id, levels=rev(unique(id)), ordered=TRUE))
+
+         x.breaks <- seq(length(tasks$Task) + 0.5 - 3, 0, by=-3)
+
+         timeline <- ggplot(tasks.long, aes(y=id, yend=id, x=Start.second, xend=End.second, colour=Task)) +
+           geom_segment() +
+           geom_vline(xintercept=x.breaks, colour="grey80", linetype="dotted") +
+           guides(colour=guide_legend(title=NULL)) +
+           labs(x=NULL, y=NULL) +
+           theme_gantt() + theme(axis.text.x=element_text(angle=45, hjust=1))
+
+         rbbt.png_plot('#{plot}', 'plot(timeline)', width=#{width}, height=#{height}, pointsize=6)
+       EOF
+     end
+
+     if options[:plot_data]
+       data
+     else
+       stats
+     end
+
+   end
+ end