jobserver 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE.txt +58 -0
- data/README +58 -0
- data/examples/example1.rb +19 -0
- data/examples/example2.rb +66 -0
- data/examples/example3.rb +328 -0
- data/examples/jobserver.rb +619 -0
- data/lib/jobserver.rb +619 -0
- metadata +42 -0
@@ -0,0 +1,619 @@
|
|
1
|
+
#
|
2
|
+
# Jobserver version 0.1.4
|
3
|
+
#
|
4
|
+
# Copyright (c) 2004 Christian Bang <cbang@web.de>
|
5
|
+
#
|
6
|
+
# This program is free software.
|
7
|
+
# You can distribute/modify this program under the same terms of ruby.
|
8
|
+
#
|
9
|
+
# The class +JobServer+ supplies capabilities to execute jobs on the
|
10
|
+
# local host or on remote hosts.
|
11
|
+
# * Jobs encapsulate the call of a command in a shell on a (remote) host.
|
12
|
+
# * Each client is controlled by a _worker_-thread on the server.
|
13
|
+
# When a client is idle it receives the next job from the queue.
|
14
|
+
# * Remote jobs will be launched using ssh. Therefore you must
|
15
|
+
# configure your ssh keys for password-less authentication.
|
16
|
+
# * A common home directory is _not_ needed for the clients.
|
17
|
+
# Different machine architectures as well as binaries are possible.
|
18
|
+
# * Data for the clients can be saved to files on the client machines.
|
19
|
+
# See store_data.
|
20
|
+
# == How to use the jobserver
|
21
|
+
# 1. Set up a default command to be launched in the client's working
|
22
|
+
# directory.
|
23
|
+
# 2. Write a client output handler that parses the output lines,
|
24
|
+
# retrieves the results and stores them for you.
|
25
|
+
# 3. Create your jobs, give them the parameter settings they need
|
26
|
+
# to run the client command. Give them additional data if you want.
|
27
|
+
# 4. Create a JobServer instance.
|
28
|
+
# 5. Declare the client machines you want to use.
|
29
|
+
# 6. Run the JobServer and wait for all jobs to finish.
|
30
|
+
# See an example in the description of +JobServer+ or have a look at
|
31
|
+
# the examples/ directory of the jobserver package.
|
32
|
+
|
33
|
+
require 'thread'
|
34
|
+
require 'monitor'
|
35
|
+
require 'tempfile'
|
36
|
+
|
37
|
+
###### Job ####################################################################
|
38
|
+
|
39
|
+
# Use +Job+ objects to generate a _jobQueue_ array for the +JobServer+.
|
40
|
+
# See +JobServer+ for an example.
|
41
|
+
class Job
  WAITING = 0
  RUNNING = 1
  FAILED = 2
  SUCCESS = 3

  @@default_client_command = ""
  @@nicelevel = 19 # default process priority for the command
  @@jobNumber = 0  # class variable to give each new job a new number
  @@verbose = 1

  # Returns the default client command that will be used if none is specified in #new.
  def self.default_client_command
    @@default_client_command
  end

  # Sets the default client command that will be used if none is specified in #new.
  def self.default_client_command=(s)
    @@default_client_command = s
  end

  # Returns the nice level (priority) used when the client is run. Default is 19.
  # For further information about +nice+ see the man-pages for +nice+.
  def self.nicelevel
    @@nicelevel
  end

  # Sets the nice level (priority) used when the client is run. Default is 19.
  # If you set it to nil, the +nice+ command will not be used.
  def self.nicelevel=(v)
    @@nicelevel = v
  end

  # Sets the verbose level. 0 means no output, > 0 means output. Default is 1.
  def self.verbose=(v)
    @@verbose = v
  end

  # Returns the verbose level. 0 means no output, > 0 means output.
  # The argument is ignored; it is only accepted so that callers written
  # against the former one-argument signature keep working.
  def self.verbose(v = nil)
    @@verbose
  end

  # Is the string identifier of the job. Set in #new.
  attr_reader :name
  # Is an arbitrary object containing user data for the job. Set in #new.
  attr_accessor :data
  # The name of the host the job is (or was last) executed on; nil while the
  # job has never been run. Set in +run+.
  attr_reader :host
  # Will be set when job is run and contains the working directory on the
  # (remote) host. Set in +run+.
  # You could change it in the +pre_run_handler+.
  attr_accessor :working_directory
  # The parameters for the +client_command+, set in #new.
  attr_reader :params
  # The command to be executed on the (remote) machine. Set in #new.
  # This is either a string or a +Proc+ object (see #new).
  attr_accessor :client_command
  # The results object that may be modified in the different handlers.
  # It is set to true at the start of +run+. If it is false or nil in the end,
  # the job is regarded as FAILED, otherwise as SUCCESS.
  attr_accessor :results
  # Returns the status of the execution of the job in one of the values:
  # WAITING, RUNNING, FAILED, SUCCESS
  attr_accessor :status
  # A single job object or an array of jobs that must finish before this job can run.
  # (This means, they need all status == SUCCESS.)
  attr_accessor :dependencies
  # Number of times the job has been run so far.
  attr_reader :numTries
  # The sequence number assigned to the job when it was created.
  attr_reader :number
  # list of host names the job failed on
  attr_reader :failedOnHosts

  # The arguments must be passed as a hash where the keys are the parameter names listed below.
  # You need not write {}. Keys may be given as symbols or as strings.
  # +:name+:: is the string identifier of the job
  # +:params+:: are the parameters for the command
  # +:data+::
  #   is an arbitrary object containing user data for the job.
  #   It is available in the +pre_run_handler+ and can be accessed by the +data+
  #   attribute whenever a job object is given.
  # +:dependencies+::
  #   A job or an array of jobs that must be finished first. Default is nil.
  #   Prevent circular or unfulfillable dependencies.
  # +:client_command+::
  #   either a string with the name of the command to call in order to
  #   execute the job on the (remote) machine or a +Proc+ object. In the
  #   latter case it receives the job as argument. +Proc+ objects are
  #   executed on the server only. This should come in handy for some small
  #   tasks that depend on other jobs but note that it increases the load
  #   on the server. For +Proc+ commands no output handler will be called.
  #   A default value can be set for all new jobs by +default_client_command+.
  # <tt>:pre_run_handler |job|</tt>::
  #   is a Proc object that is called before the command is run.
  # <tt>:output_handler |file,job|</tt>::
  #   is a Proc object that handles the output of
  #   the command output. You have to read the lines by yourself: (file.gets).
  #   The default handler puts the file's lines to stdout.
  #   The user can collect/store the results in the <tt>job.results</tt> variable,
  #   that is +true+ by default.
  # <tt>:post_run_handler |job|</tt>::
  #   is a Proc object that is called after the command is run.
  #
  # Raises ArgumentError if +args+ is not a hash or if no command is available.
  # See JobServer for an example.
  def initialize(args)
    raise(ArgumentError, "Hash arguments expected.", caller) unless args.kind_of?(Hash)
    # Normalise the keys into a fresh hash: adding keys to +args+ while
    # iterating over it raises on current Ruby and would also modify the
    # caller's hash.
    options = {}
    args.each_pair { |key, value| options[key.to_sym] = value }
    @@jobNumber += 1
    @number = @@jobNumber
    @name = options[:name]
    @params = options[:params]
    @data = options[:data]
    @dependencies = options[:dependencies] || []
    @dependencies = [@dependencies] unless @dependencies.is_a?(Array)
    @client_command = options[:client_command] || @@default_client_command
    if @client_command.nil? || @client_command == ""
      raise(ArgumentError, "No command specified for job #{name}", caller)
    end
    @pre_run_handler = options[:pre_run_handler]
    @output_handler = options[:output_handler] || Proc.new { |file, job| puts file.gets }
    @post_run_handler = options[:post_run_handler]
    @host = nil # hostname where the job is executed (nil if only scheduled)
    @worker_name = nil
    @working_directory = ""
    @numTries = 0
    @results = nil
    @filesToRemove = [] # paths of files to be removed after the command has run
    @status = WAITING
    @failedOnHosts = []
  end

  # Calls the +client_command+ on the given host.
  # The job will be executed by a thread +worker_name+ in +working_directory+.
  # If hostname != "localhost", the command is executed via ssh on the remote
  # machine. A +Proc+ command is always called on the server itself.
  # When the job ends with a false/nil +results+ object, +hostname+ is added
  # to +failedOnHosts+.
  # Returns +@results+.
  def run(hostname, worker_name, working_directory) # :nodoc:
    @host = hostname
    @worker_name = worker_name
    @working_directory = working_directory || ""
    @results = true # default result, can be changed by user
    @status = RUNNING
    @numTries += 1
    @pre_run_handler.call(self) if @pre_run_handler
    if @client_command.is_a?(Proc)
      @host = "localhost"
      @client_command.call(self) # run the Proc on the server (ignore that it should run remotely)
    else
      # Built after the pre_run_handler because the handler may change
      # +working_directory+.
      runCommand(shellCommand(hostname))
    end
    @post_run_handler.call(self) if @post_run_handler
    # Must assign the instance variable: a bare "failedOnHosts |= ..." only
    # creates a local variable and leaves the list empty.
    @failedOnHosts |= [hostname] unless @results
    @status = @results ? Job::SUCCESS : Job::FAILED
    @results
  end

  # Do you want to store the data of an object in a file
  # that will be accessible to the command? If yes, then use this method.
  # +data+:: is an object that can be written with puts
  # +filename+:: is relative to +working_directory+
  # +temporary+:: determines if the file should be removed after the command has run.
  # Note: you could use this method in +pre_run_handler+ even to create the client
  # script before executing it.
  def store_data(data, filename, temporary = true)
    # An empty working directory means "wherever the command starts";
    # File.join("", name) would wrongly produce "/name".
    path = @working_directory.to_s.empty? ? filename : File.join(@working_directory, filename)
    if @host == "localhost"
      # File.open does not expand "~", the shell used for the command does.
      path = File.expand_path(path)
      File.open(path, "w") { |file| file.puts data }
    else
      tempFile = Tempfile.new("#{File.basename(filename)}.tmp")
      begin
        tempFile.puts data
        tempFile.close
        # copyToHost prefixes the working directory itself, so it gets the
        # relative name.
        copyToHost(tempFile.path, filename)
      ensure
        tempFile.close(true) # remove local temp file
      end
    end
    @filesToRemove << path if temporary
  end

  # Returns the current state of the job as "<name>: <state>", followed by
  # the try count once the job has been run more than once.
  def to_s
    state = case status
            when WAITING then "waiting"
            when RUNNING then "running"
            when SUCCESS then "finished"
            when FAILED then "failed"
            else ""
            end
    text = "#{@name}: #{state}"
    text += ", try #{@numTries}" if @numTries > 1
    text
  end

  # This is called by the jobserver in order to decide if the job should be run
  # on the given host. Default behaviour is true (run on all hosts).
  # You can override this function, e.g.:
  #  require 'jobserver'
  #
  #  class Job
  #    def runsOnHost(hostname)
  #      case hostname
  #        when ...
  #      end
  #    end
  #  end
  # Beware not to construct dead-lock situations when no host satisfies a constraint.
  def runsOnHost(hostname)
    true
  end

  # Returns a subset of the given hosts for which #runsOnHost is +true+.
  def runsOnHosts(hosts)
    hosts.select { |host| runsOnHost(host) }
  end

  protected

  # Copy files to the remote(!) host.
  # Perhaps a file will have to be created on the fly by the job creator.
  # This method can be used e.g. in the +pre_run_handler+ if files are needed
  # on the remote host for the command to work on.
  # +source+ are the files on the local host
  # +destination+ the destination on @host, relative to
  # @working_directory. Used by +store_data+.
  # Returns the result of the +scp+ call as given by Kernel#system.
  def copyToHost(source, destination)
    target = @working_directory.to_s.empty? ? destination : "#{@working_directory}/#{destination}"
    system("scp -q #{source} #{@host}:#{target}")
  end

  # Builds the shell command line for a string +client_command+: an optional
  # "cd" into the working directory, an optional +nice+ prefix, the command
  # and its parameters; wrapped into an ssh call unless +hostname+ is
  # "localhost". Called by +run+.
  def shellCommand(hostname)
    # +nice+ is skipped on Windows builds of Ruby. The pattern names them
    # explicitly because a plain /win/ also matches "darwin".
    useNice = @@nicelevel && RUBY_PLATFORM !~ /mswin|mingw|cygwin|bccwin|wince/i
    nice = useNice ? "nice -#{@@nicelevel}" : ""
    chdir = (@working_directory != "") ? "cd #{quote(@working_directory)};" : ""
    command = "#{chdir}#{nice} #{@client_command} #{@params}"
    command = "ssh #{hostname} #{quote(command)}" if hostname != "localhost"
    command
  end

  # Called by +run+. Executes +command+, hands its output to the output
  # handler until end of file, then removes the temporary files registered
  # by +store_data+.
  def runCommand(command)
    puts "Running job ##{@number} on #{@worker_name}: #{command}" if @@verbose > 0
    IO.popen(command, "r") do |cin|
      until cin.eof?
        if @output_handler
          @output_handler.call(cin, self)
        else
          cin.gets # read the line
        end
      end
    end
    # remove temporary files
    unless @filesToRemove.empty?
      files = @filesToRemove.uniq
      # Forget the files before deleting them so that a retry of the job does
      # not try to delete them a second time.
      @filesToRemove.clear
      if @host == "localhost"
        File.delete(*files)
      else
        system("ssh #{@host} rm #{files.join(' ')}")
      end
    end
    @worker_name = nil # mark the job as finished
  end

  # Prefixes each of the characters | & ; ( ) \ < > ' " in +command+ with a
  # backslash. Other characters (e.g. blanks, $ or ~) are left untouched.
  def quote(command)
    command.gsub(/[|&;()\\<>'"]/) { |x| '\\' + x }
  end
end

###############################################################################

# This class is used internally by the JobServer to collect statistics for
# each host.
class HostStatistics # :nodoc:
  attr_accessor :num_jobs_finished

  def initialize
    @num_jobs_started = 0
    @num_jobs_finished = 0
    @num_jobs_failed = 0
    @userTime = 0.0
    @startTime = 0.0
  end

  # is called before a job is run
  def begin_update
    @startTime = Time.new
    @num_jobs_started += 1
  end

  # is called when a job has terminated; +success+ tells whether it counts as
  # finished or as failed. The elapsed time since +begin_update+ is added to
  # the accumulated time in both cases.
  def end_update(success)
    @userTime += Time.new - @startTime
    if success
      @num_jobs_finished += 1
    else
      @num_jobs_failed += 1
    end
  end

  # returns the current statistics
  def to_s
    avgTime = @num_jobs_finished > 0 ? '%.1f s' % (@userTime / @num_jobs_finished) : '<no jobs finished>'
    s = "Number of jobs finished: #{@num_jobs_finished}, average time per job: #{avgTime}"
    # jobs that were started but are neither finished nor failed are running
    if (diff = (@num_jobs_started - @num_jobs_failed - @num_jobs_finished)) > 0
      s += ", #{diff} job#{(diff > 1) ? 's' : ''} running"
    end
    s += ", #{@num_jobs_failed} job#{(@num_jobs_failed > 1) ? 's' : ''} failed" if @num_jobs_failed > 0
    s
  end
end

###### Job Server #############################################################

# == Usage example:
#
# === Create job handlers
#  require 'jobserver'
#  pre_run_handler = Proc.new do |job|
#    puts "Running job #{job.name} on #{job.host}"
#    # Initialize the results object as an empty array:
#    job.results = []
#  end
#  output_handler = Proc.new do |file, job|
#    line = file.gets
#    job.results << $1 if line =~ /result: (.*)/
#  end
#  post_run_handler = Proc.new do |job|
#    if job.results.empty?
#      puts "Error executing job #{job.name} on #{job.host}.\n\t#{job}"
#    else
#      puts $result = job.results.join(",")
#    end
#  end
# === Create the jobs
#  Job.default_client_command = "runclient"
#  myJobQueue = []
#  10.times{|i| myJobQueue << Job.new(:name=>"job#{i}", :params=>"#{i}", :pre_run_handler => pre_run_handler,
#               :output_handler=>output_handler, :post_run_handler=>post_run_handler)}
# === Create the server
#  server = JobServer.new(myJobQueue, "~/work") #run 1 local worker implicitly
#  server.add_ssh_worker("192.168.0.1", "~/work_sparc")
#  server.add_ssh_worker("192.168.0.2", "~/work", 2)
#  server.dumpStatistics
#  server.serve # Wait until all jobs have finished
#
class JobServer
  class Deadlock < Exception; end

  # an array of the worker threads created so far
  attr_reader :workers
  # <tt>hostStats[hostname]</tt> returns the +HostStatistics+ object for the string +hostname+
  attr_reader :hostStats

  # Instantiates a new JobServer object and creates the given number of local clients,
  # called _workers_.
  # +jobQueue+::
  #   is an array of jobs of type Job.
  #   Jobs on the _left_ side of the array are processed first.
  #   Jobs that had errors during execution will be enqueued at the end of the queue.
  #   If you want to decide whether to re-enqueue a job that had errors you can replace
  #   the method #retryJob by your own.
  #   The array itself is extended with MonitorMixin and used as the queue.
  # +local_working_directory+:: is the directory in which local clients are launched
  # +numLocalWorkers+::
  #   is the number of client workers which run on the server itself (e.g. number of CPUs)
  # +terminateWorkersWhenJobQueueEmpty+::
  #   false means, the workers continue to run, even if the queue is empty
  #   until #close was called. This can be used if you want to add jobs while
  #   others are running. The situation
  #   can occur that all remaining jobs are running and the queue is empty but
  #   you want to add more jobs.
  def initialize(jobQueue, local_working_directory = "", numLocalWorkers = 1, terminateWorkersWhenJobQueueEmpty = true)
    @jobQueue = jobQueue
    @jobQueue.extend(MonitorMixin)
    @initialQueueLength = @jobQueue.length
    @jobsRunning = []
    @jobsRunning.extend(MonitorMixin)
    @noJobs_cond = @jobQueue.new_cond
    @local_working_directory = local_working_directory
    @workers = []
    # The scheduling state below is only read or written while the @jobQueue
    # monitor is held.
    @workerHosts = {}      # worker thread => name of the host it serves
    @idleWorkers = []      # workers that found no job since the last state change
    @finishedWorkers = []  # workers that will not take any further job
    @serving = false       # set by #serve; workers do not start before
    @deadlocked = false    # set once a deadlock has been reported
    @hostStats = Hash.new
    @hostStats.extend(MonitorMixin)
    @terminateWorkersWhenJobQueueEmpty = terminateWorkersWhenJobQueueEmpty
    @usedHosts = []
    @dumpStatThread = nil
    @statsFilename = nil
    add_local_worker(numLocalWorkers)
  end

  # serve starts the workers, waits for all of them to terminate and outputs
  # statistics if verbose is true. If #dumpStatistics was called, the final
  # statistics are written to its file before serve returns.
  # Raises an exception if no worker is registered. An exception that
  # terminated a worker thread (e.g. Deadlock) is re-raised here.
  def serve(verbose = true)
    raise(Exception, "No workers registered but serve was called. The jobs can't be processed!", caller) if @workers.empty?
    # The workers wait for this flag instead of being woken up with
    # Thread#wakeup, which is lost when a worker has not gone to sleep yet.
    @jobQueue.synchronize do
      @serving = true
      notifyWorkers
    end
    @workers.each { |worker| worker.join } # wait for all workers to finish

    finishStatisticsDump

    # output statistics
    if verbose
      puts "Host statistics:\n================"
      @hostStats.to_a.sort { |x, y| x[0] <=> y[0] }.each { |host, stats| puts "#{host}: #{stats}" }
      puts
    end
  end

  # close must be called only when you have set terminateWorkersWhenJobQueueEmpty
  # to false during instantiation. Then you tell the server that no further
  # jobs will be added to the job queue.
  # All worker threads that have been waiting for new jobs will terminate now.
  # You should still wait for all workers to complete. Use *serve* to do so.
  def close
    # The condition variable may only be signalled while the monitor is held,
    # and every waiting worker has to be woken up, not just one.
    @jobQueue.synchronize do
      @terminateWorkersWhenJobQueueEmpty = true
      notifyWorkers
    end
  end

  # Writes statistics for each host to the given file.
  # If +filename+ contains no directory and a local working directory was
  # given to +new+, the file is placed in that directory.
  # +timeInSec+ is the time after which the current statistics will
  # be written. Time set to 0 means, write only when all jobs have finished.
  # When #serve has waited for all workers, the last state is written
  # regardless of the time.
  # Returns the thread that writes the statistics.
  def dumpStatistics(filename = "jobserver_stats.txt", timeInSec = 60)
    if filename == File.basename(filename) && !@local_working_directory.to_s.empty?
      # expand_path also resolves a leading "~", which File.open does not.
      filename = File.expand_path(filename, @local_working_directory)
    end
    @dumpStatThread.kill if @dumpStatThread
    @statsFilename = filename

    @dumpStatThread = Thread.new(timeInSec) do |sleepTime|
      loop do
        # sleep(0) returns immediately; sleep without argument waits until
        # the thread is woken up or killed.
        if sleepTime > 0
          sleep(sleepTime)
        else
          sleep
        end
        writeStatistics(filename)
      end
    end
  end

  # appends a job at the end of the queue and informs the sleeping worker
  # threads that new jobs are available. Returns the server.
  def add_job(job)
    @jobQueue.synchronize do
      @jobQueue << job
      notifyWorkers # wake up workers that wait for new jobs
    end
    self
  end
  alias :<< :add_job

  # Adds a worker thread that processes jobs on the local machine.
  # This method is called automatically on object instantiation.
  # +numWorkers+ indicates, how many workers should use this client. It should usually not
  # exceed the number of CPUs that are available for the host.
  # You may wish to set it to zero, if the server machine should not be used as a client
  # as well.
  def add_local_worker(numWorkers = 1)
    add_worker("localhost", @local_working_directory, numWorkers)
  end

  # Adds a worker thread that processes jobs on a given remote host machine.
  # +hostname+ can contain a username like: <tt>fool@192.168.0.1</tt>
  # If +working_directory+ is not empty then the client command will be executed in the
  # given directory.
  # +numWorkers+ indicates, how many workers should use this client. It should usually not
  # exceed the number of CPUs that are available for the host.
  def add_ssh_worker(hostname, working_directory = "", numWorkers = 1)
    hostname = "localhost" if ENV['HOSTNAME'] == hostname
    add_worker(hostname, working_directory, numWorkers)
  end

  # Is called when +job+ couldn't be executed. Determines whether to try to rerun the job.
  # Whether a job had errors or not depends on whether the +results+ object is false/nil
  # or not. See #Job.new for further details about +results+.
  # By default, +retryJob+ allows three tries until the job is given up.
  # If you want another behaviour, override the method +retryJob+ by your own, e.g.:
  #  require 'jobserver'
  #
  #  class JobServer
  #    def retryJob(job)
  #      puts "Job failed: #{job.name}"
  #      return false
  #    end
  #  end
  def retryJob(job)
    if job.numTries < 3
      puts "FAILURE: Will try to run job later: #{job}"
      true
    end # else return false implicitly
  end

  # Removes and returns a Job from the queue using a FIFO strategy but
  # considering the dependency constraints of the jobs. That means, only
  # those jobs are chosen that have only jobs in their dependency list that
  # are finished. The user has to guarantee that the dependency-graph has no
  # cycles.
  # Furthermore each job has a function :runsOnHost that you might want to
  # override in order to decide whether the job is fit to run on the given host.
  # It may happen that no host gets a job. This may result in a job that failed but
  # that is crucial to the execution of others so that these cannot be executed any
  # more. This deadlock situation raises an exception.
  #
  # When a job failed on a host, it tries to rerun on other hosts first (if there are any).
  # Only hosts that still have a worker which can take jobs are considered
  # for that. Returns nil if no job of the queue can run on +hostname+ now.
  def getNextJob(hostname)
    @jobQueue.synchronize do
      candidate = @jobQueue.find do |queued|
        # it is assumed that all jobs from the queue are WAITING
        runnable = Array(queued.dependencies).all? { |dep| dep.status == Job::SUCCESS } &&
                   queued.runsOnHost(hostname)
        if runnable && queued.failedOnHosts.include?(hostname)
          # The job failed here already: leave it to a host it did not fail
          # on yet. If there is no such host left, try this host again.
          runnable = queued.runsOnHosts(availableHosts - queued.failedOnHosts).empty?
        end
        runnable
      end
      @jobQueue.delete(candidate) # returns nil if candidate == nil
    end
  end

  private

  # Creates +numWorkers+ new worker threads for +hostname+. This method is
  # called by +add_local_worker+ and +add_ssh_worker+.
  # The workers do not take jobs before JobServer#serve was called.
  def add_worker(hostname, working_directory, numWorkers)
    @usedHosts |= [hostname]
    numWorkers.times do |i|
      worker_name = hostname + ((numWorkers > 1) ? "_#{i}" : "")
      @hostStats.synchronize { @hostStats[worker_name] = HostStatistics.new }
      # The new thread needs the monitor for its first step, so it cannot
      # get ahead of its own registration.
      @jobQueue.synchronize do
        worker = Thread.new { runWorker(hostname, worker_name, working_directory) }
        @workerHosts[worker] = hostname
        @workers << worker
      end
    end
  end

  # Body of a worker thread: waits for #serve, then runs jobs until
  # +waitForJob+ returns nil.
  def runWorker(hostname, worker_name, working_directory)
    @jobQueue.synchronize { @noJobs_cond.wait_until { @serving } }
    while (job = waitForJob(hostname))
      processJob(job, hostname, worker_name, working_directory)
    end
  ensure
    # Also reached when the thread ends with an exception: the other workers
    # must not keep waiting for this host.
    @jobQueue.synchronize do
      @finishedWorkers |= [Thread.current]
      notifyWorkers
    end
  end

  # Blocks until a job for +hostname+ is available and returns it.
  # Returns nil when the worker should terminate: the queue is empty and
  # workers terminate on an empty queue, or a deadlock has been reported.
  # Raises Deadlock in the worker that detects that nobody can get a job.
  def waitForJob(hostname)
    me = Thread.current
    @jobQueue.synchronize do
      loop do
        break if @deadlocked
        if @jobQueue.empty?
          break if @terminateWorkersWhenJobQueueEmpty
        else
          job = getNextJob(hostname)
          if job
            @idleWorkers.delete(me)
            return job
          end
        end
        # sleep/wait until new jobs arrived or some jobs finished
        @idleWorkers |= [me]
        reportDeadlock if deadlocked?
        @noJobs_cond.wait
      end
      @idleWorkers.delete(me)
      # Marked while the monitor is still held so that no other worker leaves
      # a job to this host after the decision to terminate.
      @finishedWorkers |= [me]
      nil
    end
  end

  # Runs +job+ on behalf of the worker +worker_name+, keeps the statistics up
  # to date and re-enqueues the job if it failed and #retryJob agrees.
  def processJob(job, hostname, worker_name, working_directory)
    @jobsRunning.synchronize { @jobsRunning << job }
    @hostStats.synchronize { @hostStats[worker_name].begin_update }
    results = job.run(hostname, worker_name, working_directory)
    @jobsRunning.synchronize { @jobsRunning.delete(job) }
    @hostStats.synchronize { @hostStats[worker_name].end_update(results) }
    requeue = !results && retryJob(job)
    @jobQueue.synchronize do
      if results
        job.dependencies = nil # free them for the garbage collection
      elsif requeue
        job.status = Job::WAITING
        @jobQueue << job
      end
      # wake up workers that wait for new jobs, because some
      # constraint/dependency could be satisfied now
      notifyWorkers
    end
  end

  # Must be called with the @jobQueue monitor held whenever something changed
  # that may allow a waiting worker to proceed. Every worker has to look at
  # the queue again before it counts as idle.
  def notifyWorkers
    @idleWorkers.clear
    @noJobs_cond.broadcast
  end

  # The hosts that still have a worker which can take jobs.
  # Must be called with the @jobQueue monitor held.
  def availableHosts
    (@workers - @finishedWorkers).map { |worker| @workerHosts[worker] }.uniq
  end

  # True if the queue is not empty but every worker either terminated or
  # found no runnable job since the last state change.
  # Must be called with the @jobQueue monitor held.
  def deadlocked?
    !@jobQueue.empty? && (@workers - @finishedWorkers - @idleWorkers).empty?
  end

  # Prints the job queue and the unsatisfied dependencies to $stderr, tells
  # the other workers to terminate and raises Deadlock.
  # The deadlock can be caused by unsatisfiable or circular dependencies.
  # Must be called with the @jobQueue monitor held.
  def reportDeadlock
    @deadlocked = true
    # Thread.critical only exists in old Ruby versions.
    critical = Thread.respond_to?(:critical=)
    Thread.critical = true if critical # halt the other threads during output
    begin
      $stderr.puts "\nDeadlock! -- Current Job Queue (#{@jobQueue.length} jobs):\n========================================="
      $stderr.puts @jobQueue
      $stderr.puts "\nJobs with unsatisfied dependencies:"
      @jobQueue.each do |queued|
        $stderr.puts "#{queued.name}:"
        unsatisfied = Array(queued.dependencies).select { |dep| dep.status != Job::SUCCESS }.join("\n ")
        if unsatisfied == ""
          $stderr.puts " no unsatisfied jobs"
        else
          $stderr.puts " " + unsatisfied.to_s
        end
      end
      $stderr.puts
      $stderr.puts "Before debugging the jobserver please verify, if"
      $stderr.puts " a) no job on which a job form the job queue depends on failed"
      $stderr.puts " b) the jobs form the job queue cannot be run on any host (if you redefine Job.runsOnHost)"
    ensure
      Thread.critical = false if critical
    end
    notifyWorkers # the waiting workers see @deadlocked and terminate
    raise Deadlock, "JobServer deadlock: no worker can get a job but there are jobs in the queue."
  end

  # Writes the host statistics, the running jobs and the queued jobs to
  # +filename+, replacing its previous content.
  def writeStatistics(filename)
    File.open(filename, "w") do |file|
      file.puts "Host statistics:\n================"
      @hostStats.synchronize do
        @hostStats.to_a.sort { |x, y| x[0] <=> y[0] }.each { |host, stats| file.puts "#{host}: #{stats}" }
      end
      @jobsRunning.synchronize do
        unless @jobsRunning.empty?
          file.puts "\nJobs running:\n============"
          file.puts @jobsRunning.map { |job| "#{job.host}: #{job}" }, ""
        end
      end
      @jobQueue.synchronize do
        unless @jobQueue.empty?
          file.puts s = "Jobs in the queue: (#{@jobQueue.length}/#@initialQueueLength remaining)"
          file.puts "=" * s.length
          file.puts @jobQueue
        end
      end
    end
  end

  # Stops the statistics thread started by #dumpStatistics (if any) and
  # writes the final state itself, so that the file is complete when #serve
  # returns.
  def finishStatisticsDump
    return unless @dumpStatThread
    @dumpStatThread.kill
    @dumpStatThread.join
    @dumpStatThread = nil
    writeStatistics(@statsFilename)
  end
end
|