jobserver 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE.txt +58 -0
- data/README +58 -0
- data/examples/example1.rb +19 -0
- data/examples/example2.rb +66 -0
- data/examples/example3.rb +328 -0
- data/examples/jobserver.rb +619 -0
- data/lib/jobserver.rb +619 -0
- metadata +42 -0
data/lib/jobserver.rb
ADDED
@@ -0,0 +1,619 @@
|
|
1
|
+
#
|
2
|
+
# Jobserver version 0.1.4
|
3
|
+
#
|
4
|
+
# Copyright (c) 2004 Christian Bang <cbang@web.de>
|
5
|
+
#
|
6
|
+
# This program is free software.
|
7
|
+
# You can distribute/modify this program under the same terms of ruby.
|
8
|
+
#
|
9
|
+
# The class +JobServer+ supplies capabilities to execute jobs on the
|
10
|
+
# local host or on remote hosts.
|
11
|
+
# * Jobs encapsulate the call of a command in a shell on a (remote) host.
|
12
|
+
# * Each client is controlled by a _worker_-thread on the server.
|
13
|
+
# When a client is idle it receives the next job from the queue.
|
14
|
+
# * Remote jobs will be launched using ssh. Therefore you must
|
15
|
+
# configure your ssh keys without an authentication password.
|
16
|
+
# * A common home directory is _not_ needed for the clients.
|
17
|
+
# Different machine architectures as well as binaries are possible.
|
18
|
+
# * Data for the clients can be saved to files on the client machines.
|
19
|
+
# See store_data.
|
20
|
+
# == How to use the jobserver
|
21
|
+
# 1. Set up a default command to be launched in the client's working
|
22
|
+
# directory.
|
23
|
+
# 2. Write a client output handler that parses the output lines,
|
24
|
+
# retrieves the results and stores them for you.
|
25
|
+
# 3. Create your jobs, give them the parameter settings they need
|
26
|
+
# to run the client command. Give them additional data if you want.
|
27
|
+
# 4. Create a JobServer instance.
|
28
|
+
# 5. Declare the client machines you want to use.
|
29
|
+
# 6. Run the JobServer and wait for all jobs to finish.
|
30
|
+
# See an example in the description of +JobServer+ or have a look at
|
31
|
+
# the examples/ directory of the jobserver package.
|
32
|
+
|
33
|
+
require 'thread'
|
34
|
+
require 'monitor'
|
35
|
+
require 'tempfile'
|
36
|
+
|
37
|
+
###### Job ####################################################################
|
38
|
+
|
39
|
+
# Use +Job+ objects to generate a _jobQueue_ array for the +JobServer+.
|
40
|
+
# See +JobServer+ for an example.
|
41
|
+
class Job
  # Values of #status.
  WAITING = 0
  RUNNING = 1
  FAILED = 2
  SUCCESS = 3

  @@default_client_command = ""
  @@nicelevel = 19 # default process priority for the command
  @@jobNumber = 0  # class variable to give each new job a new number
  @@verbose = 1

  # Returns the default client command that will be used if none is specified in #new.
  def self.default_client_command
    @@default_client_command
  end

  # Sets the default client command that will be used if none is specified in #new.
  def self.default_client_command=(s)
    @@default_client_command = s
  end

  # Returns the nice level (priority) used when the client is run. Default is 19.
  # For further information about +nice+ see the man-pages for +nice+.
  def self.nicelevel
    @@nicelevel
  end

  # Sets the nice level (priority) used when the client is run. Default is 19.
  # If you set it to nil, the +nice+ command will not be used.
  def self.nicelevel=(v)
    @@nicelevel = v
  end

  # Sets the verbose level. 0 means no output, > 0 means output. Default is 1.
  def self.verbose=(v)
    @@verbose = v
  end

  # Returns the verbose level. 0 means no output, > 0 means output.
  # The argument is ignored; it is accepted only so that callers written
  # against the former one-argument signature keep working.
  def self.verbose(v = nil)
    @@verbose
  end

  # Is the string identifier of the job. Set in #new.
  attr_reader :name
  # Is an arbitrary object containing user data for the job. Set in #new.
  attr_accessor :data
  # The name of the host the job is (or was last) executed on; nil while the
  # job is only scheduled. Set in +run+.
  attr_reader :host
  # Will be set when job is run and contains the working directory on the
  # (remote) host. Set in +run+.
  # You could change it in the +pre_run_handler+.
  attr_accessor :working_directory
  # The parameters for the +client_command+, set in #new.
  attr_reader :params
  # The command to be executed on the (remote) machine. Set in #new.
  # This is either a string or a +Proc+ object (see #new).
  attr_accessor :client_command
  # The results object that may be modified in the different handlers.
  # It is set to true at the start of +run+. If it is false or nil in the end,
  # the job is regarded as FAILED, otherwise as SUCCESS.
  attr_accessor :results
  # Returns the status of the execution of the job in one of the values:
  # WAITING, RUNNING, FAILED, SUCCESS
  attr_accessor :status
  # A single job object or an array of jobs that must finish before this job can run.
  # (This means, they need all status == SUCCESS.)
  attr_accessor :dependencies
  # Number of times +run+ has been called for this job.
  attr_reader :numTries
  # The sequence number assigned to the job at creation.
  attr_reader :number
  # list of host names the job failed on
  attr_reader :failedOnHosts

  # The arguments must be passed as a hash where the keys are the parameter names listed below.
  # You need not write {}. String keys are accepted as well as symbols.
  # +:name+:: is the string identifier of the job
  # +:params+:: are the parameters for the command
  # +:data+::
  #   is an arbitrary object containing user data for the job.
  #   It is available in the +pre_run_handler+ and can be accessed by the +data+
  #   attribute whenever a job object is given.
  # +:dependencies+::
  #   A job or an array of jobs that must be finished first. Default is nil.
  #   Prevent circular or unfulfillable dependencies.
  # +:client_command+::
  #   either a string with the name of the command to call in order to
  #   execute the job on the (remote) machine or a +Proc+ object. In the
  #   latter case it receives the job as argument. +Proc+ objects are
  #   executed on the server only. For +Proc+ commands no output handler
  #   will be called.
  #   A default value can be set for all new jobs by +default_client_command+.
  # <tt>:pre_run_handler |job|</tt>::
  #   is a Proc object that is called before the command is run.
  # <tt>:output_handler |file,job|</tt>::
  #   is a Proc object that handles the output of the command.
  #   You have to read the lines by yourself: (file.gets).
  #   The default handler puts the file's lines to stdout.
  # <tt>:post_run_handler |job|</tt>::
  #   is a Proc object that is called after the command is run.
  #
  # Raises ArgumentError if +args+ is not a Hash or if no command is given.
  def initialize(args)
    raise(ArgumentError, "Hash arguments expected.", caller) unless args.kind_of?(Hash)
    # Copy into a fresh hash: adding keys to +args+ while iterating over it
    # raises on current Ruby versions and would also modify the caller's hash.
    opts = {}
    args.each_pair { |key, value| opts[key.to_sym] = value } # allow string keys also
    @@jobNumber += 1
    @number = @@jobNumber
    @name = opts[:name]
    @params = opts[:params]
    @data = opts[:data]
    @dependencies = opts[:dependencies] || []
    @dependencies = [@dependencies] unless @dependencies.is_a?(Array)
    @client_command = opts[:client_command] || @@default_client_command
    raise(ArgumentError, "No command specified for job #{name}", caller) if @client_command == ""
    @pre_run_handler = opts[:pre_run_handler]
    @output_handler = opts[:output_handler] || Proc.new { |file, job| puts file.gets }
    @post_run_handler = opts[:post_run_handler]
    @host = nil # hostname where the job is executed (nil if only scheduled)
    @worker_name = nil
    @working_directory = ""
    @numTries = 0
    @results = nil
    @filesToRemove = [] # paths of files to be removed after the command has run
    @status = WAITING
    @failedOnHosts = []
  end

  # Calls the +client_command+ on the given host.
  # The job will be executed by a thread +worker_name+ in +working_directory+.
  # If hostname != "localhost", the command is executed via ssh on the remote
  # machine.
  # Returns +@results+.
  def run(hostname, worker_name, working_directory) # :nodoc:
    @host = hostname
    @worker_name = worker_name
    @working_directory = working_directory || ""
    @results = true # default result, can be changed by user
    @status = RUNNING
    @numTries += 1
    @pre_run_handler.call(self) if @pre_run_handler
    if @client_command.is_a?(Proc)
      @host = "localhost"
      @client_command.call(self) # run the Proc on the server (ignore that it should run remotely)
    else
      # RUBY_PLATFORM replaces the removed PLATFORM constant; the pattern names
      # the Windows builds explicitly because a bare /win/ also matches "darwin".
      nice = (@@nicelevel && RUBY_PLATFORM !~ /mswin|mingw|bccwin|wince|emx/i) ? "nice -#{@@nicelevel}" : ""
      chdir = (@working_directory != "") ? "cd #{quote(@working_directory)};" : ""
      cmd = "#{chdir}#{nice} #{@client_command} #{@params}"
      cmd = "ssh #{hostname} #{quote(cmd)}" if hostname != "localhost"
      runCommand(cmd)
    end
    @post_run_handler.call(self) if @post_run_handler
    # Must be the instance variable: a bare `failedOnHosts |= ...` only
    # creates a local variable and the failure would never be recorded.
    @failedOnHosts |= [hostname] unless @results
    @status = @results ? Job::SUCCESS : Job::FAILED
    return @results
  end

  # Do you want to store the data of an object in a file
  # that will be accessible to the command? If yes, then use this method.
  # +data+:: is an object that can be written with puts
  # +filename+:: is relative to +working_directory+
  # +temporary+:: determines if the file should be removed after the command has run.
  # Note: you could use this method in +pre_run_handler+ even to create the client
  # script before executing it.
  def store_data(data, filename, temporary = true)
    directory = @working_directory.to_s
    # File.join("", name) would yield "/name", so only join a non-empty directory.
    target = directory.empty? ? filename : File.join(directory, filename)
    if @host == "localhost"
      target = File.expand_path(target) # File.open does not expand "~" itself
      File.open(target, "w") { |file| file.puts data }
    else
      tempFile = Tempfile.new("#{File.basename(filename)}.tmp")
      begin
        tempFile.puts data
        tempFile.close
        # copyToHost prepends the working directory itself, so it gets the
        # relative name; passing +target+ would apply the directory twice.
        copyToHost(tempFile.path, filename)
      ensure
        tempFile.close(true) # remove local temp file
      end
    end
    @filesToRemove << target if temporary
  end

  # Returns the current state of the job.
  def to_s
    s = "#{@name}: "
    case status
    when WAITING
      s += "waiting"
    when RUNNING
      s += "running"
    when SUCCESS
      s += "finished"
    when FAILED
      s += "failed"
    end
    s += ", try #{@numTries}" if @numTries > 1
    return s
  end

  # This is called by the jobserver in order to decide if the job should be run
  # on the given host. Default behaviour is true (run on all hosts).
  # You can override this function, e.g.:
  #   require 'jobserver'
  #
  #   class Job
  #     def runsOnHost(hostname)
  #       case hostname
  #         when ...
  #       end
  #     end
  #   end
  # Beware not to construct dead-lock situations when no host satisfies a constraint.
  def runsOnHost(hostname)
    true
  end

  # Returns a subset of the given hosts for which #runsOnHost is +true+.
  def runsOnHosts(hosts)
    hosts.select { |host| runsOnHost(host) }
  end

  protected

  # Copy files to the remote(!) host using scp.
  # +source+ are the files on the local host
  # +destination+ the destination on @host, relative to
  # @working_directory. Used by +store_data+.
  # Returns the result of +system+.
  def copyToHost(source, destination)
    directory = @working_directory.to_s
    remotePath = directory.empty? ? destination : "#{directory}/#{destination}"
    system("scp -q #{source} #{@host}:#{remotePath}")
  end

  # Called by +run+. Executes +command+, feeds its output to the output
  # handler and afterwards removes the files registered by +store_data+.
  def runCommand(command)
    puts "Running job ##{@number} on #{@worker_name}: #{command}" if @@verbose > 0
    IO.popen(command, "r") do |cin|
      until cin.eof?
        if @output_handler
          @output_handler.call(cin, self)
        else
          cin.gets # read the line
        end
      end
    end
    # remove temporary files; the list is emptied first so that a later retry
    # of this job does not try to delete the same files a second time
    doomed = @filesToRemove.dup
    @filesToRemove.clear
    unless doomed.empty?
      if @host == "localhost"
        File.delete(*doomed)
      else
        system("ssh #{@host} rm #{doomed.join(' ')}")
      end
    end
    @worker_name = nil # mark the job as finished
  end

  # Backslash-escapes the characters | & ; ( ) \ < > ' " in the command.
  def quote(command)
    command.gsub(/[|&;()\\<>'"]/) { |x| '\\' + x }
  end
end
|
287
|
+
|
288
|
+
###############################################################################
|
289
|
+
|
290
|
+
# This class is used internally by the JobServer to collect statistics for
|
291
|
+
# each host.
|
292
|
+
class HostStatistics # :nodoc:
  # Number of jobs that ended successfully.
  attr_accessor :num_jobs_finished

  # All counters start at zero.
  def initialize
    @num_jobs_started = 0
    @num_jobs_finished = 0
    @num_jobs_failed = 0
    @userTime = 0.0
    @startTime = 0.0
  end

  # Is called before a job is run: remembers the start time and counts the job
  # as started.
  def begin_update
    @startTime = Time.new
    @num_jobs_started += 1
  end

  # Is called when a job has ended. Adds the time elapsed since #begin_update
  # to the accumulated time and counts the job as finished or failed,
  # depending on +success+.
  def end_update(success)
    @userTime += Time.new - @startTime
    return @num_jobs_finished += 1 if success

    @num_jobs_failed += 1
  end

  # Returns the current statistics as a single line of text.
  def to_s
    average =
      if @num_jobs_finished > 0
        '%.1f s' % (@userTime / @num_jobs_finished)
      else
        '<no jobs finished>'
      end
    parts = ["Number of jobs finished: #{@num_jobs_finished}, average time per job: #{average}"]

    # jobs that were started but neither finished nor failed yet
    running = @num_jobs_started - @num_jobs_failed - @num_jobs_finished
    parts << "#{running} job#{'s' if running > 1} running" if running > 0
    parts << "#{@num_jobs_failed} job#{'s' if @num_jobs_failed > 1} failed" if @num_jobs_failed > 0
    parts.join(", ")
  end
end
|
329
|
+
|
330
|
+
###### Job Server #############################################################
|
331
|
+
|
332
|
+
|
333
|
+
# == Usage example:
|
334
|
+
#
|
335
|
+
# === Create job handlers
|
336
|
+
# require 'jobserver'
|
337
|
+
# pre_run_handler = Proc.new do |job|
|
338
|
+
# puts "Running job #{job.name} on #{job.host}"
|
339
|
+
# # Initialize the results object as an empty array:
|
340
|
+
# job.results = []
|
341
|
+
# end
|
342
|
+
# output_handler = Proc.new do |file, job|
|
343
|
+
# line = file.gets
|
344
|
+
# job.results << $1 if line =~ /result: (.*)/
|
345
|
+
# end
|
346
|
+
# post_run_handler = Proc.new do |job|
|
347
|
+
# if job.results.empty?
|
348
|
+
# puts "Error executing job #{job.name} on #{job.host}.\n\t#{job}"
|
349
|
+
# else
|
350
|
+
# puts $result = job.results.join(",")
|
351
|
+
# end
|
352
|
+
# end
|
353
|
+
# === Create the jobs
|
354
|
+
# Job.default_client_command = "runclient"
|
355
|
+
# myJobQueue = []
|
356
|
+
# 10.times{|i| myJobQueue << Job.new(:name=>"job#{i}", :params=>"#{i}", :pre_run_handler => pre_run_handler,
|
357
|
+
# :output_handler=>output_handler, :post_run_handler=>post_run_handler)}
|
358
|
+
# === Create the server
|
359
|
+
# server = JobServer.new(myJobQueue, "~/work") #run 1 local worker implicitly
|
360
|
+
# server.add_ssh_worker("192.168.0.1", "~/work_sparc")
|
361
|
+
# server.add_ssh_worker("192.168.0.2", "~/work", 2)
|
362
|
+
# server.dumpStatistics
|
363
|
+
# server.serve # Wait until all jobs have finished
|
364
|
+
#
|
365
|
+
class JobServer
  # Raised inside a worker thread (and re-raised by #serve when it joins that
  # thread) if jobs remain in the queue but no worker can run any of them.
  class Deadlock < Exception; end

  # an array of the worker threads
  attr_reader :workers
  # <tt>hostStats[worker_name]</tt> returns the +HostStatistics+ object of that worker
  attr_reader :hostStats

  # Instantiates a new JobServer object and creates the given number of local clients,
  # called _workers_.
  # +jobQueue+::
  #   is an array of jobs of type Job. It is extended with MonitorMixin.
  #   Jobs on the _left_ side of the array are processed first.
  #   Jobs that had errors during execution will be enqueued at the end of the queue.
  #   If you want to decide whether to re-enqueue a job that had errors you can replace
  #   the method #retryJob by your own.
  # +local_working_directory+:: is the directory in which local clients are launched
  # +numLocalWorkers+::
  #   is the number of client workers which run on the server itself (e.g. number of CPUs)
  # +terminateWorkersWhenJobQueueEmpty+::
  #   false means, the workers continue to run, even if the queue is empty
  #   until #close was called. This can be used if you want to add jobs while
  #   others are running.
  def initialize(jobQueue, local_working_directory = "", numLocalWorkers = 1, terminateWorkersWhenJobQueueEmpty = true)
    @jobQueue = jobQueue
    @jobQueue.extend(MonitorMixin)
    @initialQueueLength = @jobQueue.length
    @jobsRunning = []
    @jobsRunning.extend(MonitorMixin)
    @noJobs_cond = @jobQueue.new_cond
    @local_working_directory = local_working_directory
    @workers = []
    @hostStats = Hash.new
    @hostStats.extend(MonitorMixin)
    @terminateWorkersWhenJobQueueEmpty = terminateWorkersWhenJobQueueEmpty
    @usedHosts = []
    # Gate shared by the workers (they start once @serving is set) and by the
    # statistics dumper (it finishes once @allWorkersDone is set).
    @gateLock = Mutex.new
    @gateCond = ConditionVariable.new
    @serving = false
    @allWorkersDone = false
    @dumpStatThread = nil
    add_local_worker(numLocalWorkers)
  end

  # serve starts the workers, waits for all of them to terminate and outputs
  # statistics if verbose is true. If #dumpStatistics was called, the final
  # statistics are written before serve returns.
  # Raises Exception if no worker is registered.
  def serve(verbose = true)
    raise(Exception, "No workers registered but serve was called. The jobs can't be processed!", caller) if @workers.empty?
    # Open the start gate. A flag plus condition variable cannot lose the
    # start signal, whereas Thread.stop/wakeup can if a worker has not yet
    # reached its stop point when serve is called.
    @gateLock.synchronize do
      @serving = true
      @gateCond.broadcast
    end
    begin
      @workers.each { |worker| worker.join } # wait for all workers to finish
    ensure
      finishStatisticsDump
    end

    # output statistics
    if verbose
      puts "Host statistics:\n================"
      @hostStats.to_a.sort { |x, y| x[0] <=> y[0] }.each { |host, stats| puts "#{host}: #{stats}" }
      puts
    end
  end

  # close must be called only when you have set terminateWorkersWhenJobQueueEmpty
  # to false during instantiation. Then you tell the server that no further
  # jobs will be added to the job queue.
  # All worker threads that have been waiting for new jobs will terminate now.
  # You should still wait for all workers to complete. Use *serve* to do so.
  def close
    # The condition variable belongs to the queue's monitor and may only be
    # signalled while that monitor is held. Every waiting worker has to wake
    # up, therefore broadcast instead of signal.
    @jobQueue.synchronize do
      @terminateWorkersWhenJobQueueEmpty = true
      wakeWorkers
    end
  end

  # Writes statistics for each host to the given file.
  # If +filename+ contains no directory part and a local working directory
  # was given to +new+, the file is created in that directory.
  # +timeInSec+ is the time after which the current statistics will
  # be written. Time set to 0 means, write only when all jobs have finished.
  # When #serve ends, the last state is written regardless of the time.
  # Returns the statistics thread.
  def dumpStatistics(filename = "jobserver_stats.txt", timeInSec = 60)
    # File.join("", name) would yield "/name", so only join a non-empty directory.
    if filename == File.basename(filename) and !@local_working_directory.to_s.empty?
      filename = File.expand_path(File.join(@local_working_directory, filename))
    end

    @dumpStatThread = Thread.new(timeInSec) do |sleepTime|
      loop do
        done = @gateLock.synchronize do
          if sleepTime > 0
            @gateCond.wait(@gateLock, sleepTime) unless @allWorkersDone
          else
            # no interval: sleep(0) would return at once, so wait for the end
            @gateCond.wait(@gateLock) until @allWorkersDone
          end
          @allWorkersDone
        end
        begin
          writeStatistics(filename)
        rescue SystemCallError, IOError => e
          $stderr.puts "JobServer: could not write statistics to #{filename}: #{e.message}"
        end
        break if done
      end
    end
  end

  # appends a job at the end of the queue and informs the sleeping worker
  # threads that new jobs are available.
  def add_job(job)
    @jobQueue.synchronize do
      @jobQueue << job
      wakeWorkers # wake up workers that wait for new jobs
    end
  end
  alias :<< :add_job

  # Adds a worker thread that processes jobs on the local machine.
  # This method is called automatically on object instantiation.
  # +numWorkers+ indicates, how many workers should use this client. It should usually not
  # exceed the number of CPUs that are available for the host.
  # You may wish to set it to zero, if the server machine should not be used as a client
  # as well.
  def add_local_worker(numWorkers = 1)
    add_worker("localhost", @local_working_directory, numWorkers)
  end

  # Adds a worker thread that processes jobs on a given remote host machine.
  # +hostname+ can contain a username like: <tt>fool@192.168.0.1</tt>
  # If +working_directory+ is not empty then the client command will be executed in the
  # given directory.
  # +numWorkers+ indicates, how many workers should use this client. It should usually not
  # exceed the number of CPUs that are available for the host.
  def add_ssh_worker(hostname, working_directory = "", numWorkers = 1)
    hostname = "localhost" if ENV['HOSTNAME'] == hostname
    add_worker(hostname, working_directory, numWorkers)
  end

  # Is called when +job+ couldn't be executed. Determines whether to try to rerun the job.
  # Whether a job had errors or not depends on whether the +results+ object is false/nil
  # or not. See #Job.new for further details about +results+.
  # By default, +retryJob+ allows three tries until the job is given up.
  # If you want another behaviour, override the method +retryJob+ by your own, e.g.:
  #   require 'jobserver'
  #
  #   class JobServer
  #     def retryJob(job)
  #       puts "Job failed: #{job.name}"
  #       return false
  #     end
  #   end
  def retryJob(job)
    if job.numTries < 3
      puts "FAILURE: Will try to run job later: #{job}"
      true
    end # else return nil implicitly
  end

  # Removes and returns a Job from the queue using a FIFO strategy but
  # considering the dependency constraints of the jobs. That means, only
  # those jobs are chosen that have only jobs in their dependency list that
  # are finished. The user has to guarantee that the dependency-graph has no
  # cycles.
  # Furthermore each job has a function :runsOnHost that you might want to
  # override in order to decide whether the job is fit to run on the given host.
  #
  # When a job failed on a host, it tries to rerun on other hosts first (if there are any).
  # Returns nil if no job of the queue can be run on +hostname+ right now.
  def getNextJob(hostname)
    findRunnableJob(hostname, false)
  end

  private

  # Creates +numWorkers+ worker threads for +hostname+. This method is called
  # by +add_local_worker+ and +add_ssh_worker+.
  # A worker waits until #serve opens the start gate and then runs jobs until
  # the queue is empty and termination is requested.
  def add_worker(hostname, working_directory, numWorkers)
    @usedHosts |= [hostname]
    numWorkers.times do |i|
      worker_name = hostname + ((numWorkers > 1) ? "_#{i}" : "")
      @hostStats.synchronize { @hostStats[worker_name] = HostStatistics.new }
      worker = Thread.new do
        Thread.current[:getNoJob] = false
        begin
          @gateLock.synchronize { @gateCond.wait(@gateLock) until @serving }
          loop do
            job = waitForJob(hostname)
            break unless job # queue is empty and termination was requested
            processJob(job, hostname, worker_name, working_directory)
          end
        ensure
          # Tell the waiting workers that this one will not take jobs any more,
          # so that they re-evaluate instead of waiting for it.
          @jobQueue.synchronize do
            Thread.current[:finished] = true
            wakeWorkers
          end
        end
      end
      @jobQueue.synchronize { @workers << worker }
    end
  end

  # Blocks until a job for +hostname+ is available and returns it (already
  # removed from the queue). Returns nil when the queue is empty and the
  # workers are to terminate. Raises Deadlock if jobs are queued but neither
  # this worker nor any other can run one of them.
  def waitForJob(hostname)
    job = nil
    @jobQueue.synchronize do
      @noJobs_cond.wait_while do
        job = @jobQueue.empty? ? nil : getNextJob(hostname)
        Thread.current[:getNoJob] = job.nil?
        if job
          false
        elsif @jobQueue.empty?
          # wait for new jobs unless we are supposed to terminate
          !@terminateWorkersWhenJobQueueEmpty
        elsif noWorkerCanProceed?
          # Nobody else will take a job. Before giving up, accept a job that
          # failed on this host earlier but was held back for other hosts.
          job = findRunnableJob(hostname, true)
          if job.nil?
            reportDeadlock
            raise Deadlock, "JobServer deadlock: no worker can get a job but there are jobs in the queue."
          end
          Thread.current[:getNoJob] = false
          false
        else
          true # another worker may still change the situation
        end
      end
    end
    job
  end

  # Runs +job+ on behalf of the worker +worker_name+, updates the statistics
  # and re-enqueues the job if it failed and #retryJob agrees.
  def processJob(job, hostname, worker_name, working_directory)
    @jobsRunning.synchronize { @jobsRunning << job }
    @hostStats.synchronize { @hostStats[worker_name].begin_update }
    results = job.run(hostname, worker_name, working_directory)
    @jobsRunning.synchronize { @jobsRunning.delete(job) }
    # wake up workers that wait for new jobs, because some constraint/dependency could be satisfied now
    wakeWorkers

    @hostStats.synchronize { @hostStats[worker_name].end_update(results) }
    if results
      job.dependencies = nil # free them for the garbage collection
    elsif retryJob(job)
      job.status = Job::WAITING
      @jobQueue.synchronize do
        @jobQueue << job
        wakeWorkers # the job is available again
      end
    end
  end

  # Wakes all workers waiting for a job. Their "found no job" marks are
  # cleared first, because each of them re-evaluates the queue after waking
  # and a stale mark would make the deadlock check fire too early.
  def wakeWorkers
    @jobQueue.synchronize do
      @workers.each { |worker| worker[:getNoJob] = false if worker.alive? }
      @noJobs_cond.broadcast
    end
  end

  # True if every worker has either ended or found no runnable job at its
  # latest look at the queue. Must be called with the queue monitor held.
  def noWorkerCanProceed?
    @workers.all? { |worker| !worker.alive? or worker[:finished] or worker[:getNoJob] }
  end

  # Implements #getNextJob. With +ignoreFailedHosts+ set, a job is accepted
  # even if it failed on +hostname+ before and other hosts are left to try.
  def findRunnableJob(hostname, ignoreFailedHosts)
    candidate = @jobQueue.find do |queued|
      # it is assumed that all jobs from the queue are WAITING
      ready = (queued.dependencies.all? { |dep| dep.status == Job::SUCCESS } and
               queued.runsOnHost(hostname))
      if ready and !ignoreFailedHosts and queued.failedOnHosts.include?(hostname)
        # If there are still hosts the job could run on and did not fail on yet,
        # leave it to them. If it failed on all hosts, try the current host again.
        ready = queued.runsOnHosts(@usedHosts - queued.failedOnHosts).empty?
      end
      ready
    end
    @jobQueue.delete(candidate) # returns nil if candidate == nil
  end

  # Prints the job queue and the unsatisfied dependencies of its jobs to
  # stderr. The deadlock can be caused by unsatisfiable or circular
  # dependencies or by host constraints.
  def reportDeadlock
    $stderr.puts "\nDeadlock! -- Current Job Queue (#{@jobQueue.length} jobs):\n========================================="
    $stderr.puts @jobQueue
    $stderr.puts "\nJobs with unsatisfied dependencies:"
    @jobQueue.each do |queued|
      $stderr.puts "#{queued.name}:"
      unsatisfied = queued.dependencies.select { |dep| dep.status != Job::SUCCESS }.join("\n ")
      if unsatisfied == ""
        $stderr.puts " no unsatisfied jobs"
      else
        $stderr.puts " " + unsatisfied.to_s
      end
    end
    $stderr.puts
    $stderr.puts "Before debugging the jobserver please verify, if"
    $stderr.puts " a) no job on which a job form the job queue depends on failed"
    $stderr.puts " b) the jobs form the job queue cannot be run on any host (if you redefine Job.runsOnHost)"
  end

  # Writes the host statistics, the running jobs and the queued jobs to +filename+.
  def writeStatistics(filename)
    File.open(filename, "w") do |file|
      file.puts "Host statistics:\n================"
      @hostStats.synchronize do
        @hostStats.to_a.sort { |x, y| x[0] <=> y[0] }.each { |host, stats| file.puts "#{host}: #{stats}" }
      end
      unless @jobsRunning.empty?
        @jobsRunning.synchronize do
          file.puts "\nJobs running:\n============"
          file.puts @jobsRunning.map { |job| "#{job.host}: #{job}" }, ""
        end
      end
      unless @jobQueue.empty?
        @jobQueue.synchronize do
          file.puts s = "Jobs in the queue: (#{@jobQueue.length}/#@initialQueueLength remaining)"
          file.puts "=" * s.length
          file.puts @jobQueue
        end
      end
    end
  end

  # Lets the statistics thread (if any) write the final state and waits for it.
  def finishStatisticsDump
    @gateLock.synchronize do
      @allWorkersDone = true
      @gateCond.broadcast
    end
    @dumpStatThread.join if @dumpStatThread
  end
end
|