jobserver 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,619 @@
1
+ #
2
+ # Jobserver version 0.1.4
3
+ #
4
+ # Copyright (c) 2004 Christian Bang <cbang@web.de>
5
+ #
6
+ # This program is free software.
7
+ # You can distribute/modify this program under the same terms of ruby.
8
+ #
9
+ # The class +JobServer+ supplies capabilities to execute jobs on the
10
+ # local host or on remote hosts.
11
+ # * Jobs encapsulate the call of a command in a shell on a (remote) host.
12
+ # * Each client is controlled by a _worker_-thread on the server.
13
+ # When a client is idle it receives the next job from the queue.
14
+ # * Remote jobs will be launched using ssh. Therefore you must
15
+ # configure your ssh keys so that no authentication password is required.
16
+ # * A common home directory is _not_ needed for the clients.
17
+ # Different machine architectures as well as binaries are possible.
18
+ # * Data for the clients can be saved to files on the client machines.
19
+ # See store_data.
20
+ # == How to use the jobserver
21
+ # 1. Set up a default command to be launched in the client's working
22
+ # directory.
23
+ # 2. Write a client output handler that parses the output lines,
24
+ # retrieves the results and stores them for you.
25
+ # 3. Create your jobs, give them the parameter settings they need
26
+ # to run the client command. Give them additional data if you want.
27
+ # 4. Create a JobServer instance.
28
+ # 5. Declare the client machines you want to use.
29
+ # 6. Run the JobServer and wait for all jobs to finish.
30
+ # See an example in the description of +JobServer+ or have a look at
31
+ # the examples/ directory of the jobserver package.
32
+
33
+ require 'thread'
34
+ require 'monitor'
35
+ require 'tempfile'
36
+
37
+ ###### Job ####################################################################
38
+
39
# Use +Job+ objects to generate a _jobQueue_ array for the +JobServer+.
# A job wraps one shell command (or a +Proc+) together with its parameters,
# user data, handlers and dependencies. See +JobServer+ for an example.
class Job
  # Status values of a job, see #status.
  WAITING = 0
  RUNNING = 1
  FAILED = 2
  SUCCESS = 3

  @@default_client_command = ""
  @@nicelevel = 19 # default process priority for the command
  @@jobNumber = 0  # class-wide counter that gives each new job a new number
  @@verbose = 1

  # Returns the default client command that will be used if none is specified in #new.
  def self.default_client_command
    @@default_client_command
  end

  # Sets the default client command that will be used if none is specified in #new.
  def self.default_client_command=(s)
    @@default_client_command = s
  end

  # Returns the nice level (priority) used when the client is run. Default is 19.
  # For further information about +nice+ see the man-pages for +nice+.
  def self.nicelevel
    @@nicelevel
  end

  # Sets the nice level (priority) used when the client is run. Default is 19.
  # If you set it to nil, the +nice+ command will not be used.
  def self.nicelevel=(v)
    @@nicelevel = v
  end

  # Sets the verbose level. 0 means no output, > 0 means output. Default is 1.
  def self.verbose=(v)
    @@verbose = v
  end

  # Returns the verbose level. 0 means no output, > 0 means output.
  # The optional argument is ignored; it is only accepted because earlier
  # versions of this getter (accidentally) required one.
  def self.verbose(_ignored = nil)
    @@verbose
  end

  # Is the string identifier of the job. Set in #new.
  attr_reader :name
  # Is an arbitrary object containing user data for the job. Set in #new.
  attr_accessor :data
  # Name of the host the job is (or was last) executed on; nil while the job
  # has only been scheduled. Set in +run+.
  attr_reader :host
  # Will be set when job is run and contains the working directory on the
  # (remote) host. Set in +run+.
  # You could change it in the +pre_run_handler+.
  attr_accessor :working_directory
  # The parameters for the +client_command+, set in #new.
  attr_reader :params
  # The command to be executed on the (remote) machine. Set in #new.
  # This is either a string or a +Proc+ object (see #new).
  attr_accessor :client_command
  # The results object that may be modified in the different handlers.
  # It is set to true at the start of +run+. If it is false or nil in the end,
  # the job is regarded as FAILED, otherwise as SUCCESS.
  attr_accessor :results
  # Returns the status of the execution of the job in one of the values:
  # WAITING, RUNNING, FAILED, SUCCESS
  attr_accessor :status
  # An array of jobs that must finish before this job can run.
  # (This means, they need all status == SUCCESS.)
  attr_accessor :dependencies
  # Number of times the job has been run so far.
  attr_reader :numTries
  # list of host names the job failed on
  attr_reader :failedOnHosts

  # Sequence number of the job (1 for the first job created, 2 for the next, ...).
  def number
    @number
  end

  # The arguments must be passed as a hash where the keys are the parameter names listed below.
  # You need not write {}. String keys are accepted as well as symbols;
  # the hash passed in is not modified.
  # +:name+:: is the string identifier of the job
  # +:params+:: are the parameters for the command
  # +:data+::
  #   is an arbitrary object containing user data for the job.
  #   It is available in the +pre_run_handler+ and can be accessed by the +data+
  #   attribute whenever a job object is given.
  # +:dependencies+::
  #   A job or an array of jobs that must be finished first. Default is nil.
  #   Prevent circular or unfulfillable dependencies.
  # +:client_command+::
  #   either a string with the name of the command to call in order to
  #   execute the job on the (remote) machine or a +Proc+ object. In the
  #   latter case it receives the job as argument. +Proc+ objects are
  #   executed on the server only. This should come in handy for some small
  #   tasks that depend on other jobs but note that it increases the load
  #   on the server. For +Proc+ commands no output handler will be called.
  #   A default value can be set for all new jobs by +default_client_command+.
  # <tt>:pre_run_handler |job|</tt>::
  #   is a Proc object that is called before the command is run.
  # <tt>:output_handler |file,job|</tt>::
  #   is a Proc object that handles the output of
  #   the command output. You have to read the lines by yourself: (file.gets).
  #   The default handler puts the file's lines to stdout.
  #   The user can collect/store the results in the <tt>job.results</tt> variable,
  #   that is +true+ by default.
  # <tt>:post_run_handler |job|</tt>::
  #   is a Proc object that is called after the command is run.
  #
  # Raises ArgumentError if +args+ is not a Hash or if no command is available.
  # See JobServer for an example.
  def initialize(args)
    raise(ArgumentError, "Hash arguments expected.", caller) unless args.kind_of?(Hash)
    # Normalize into a fresh hash: adding keys to +args+ while iterating over it
    # raises on current Ruby versions and would also modify the caller's hash.
    opts = {}
    args.each_pair { |key, value| opts[key.to_sym] = value } # allow string keys also
    @@jobNumber += 1
    @number = @@jobNumber
    @name = opts[:name]
    @params = opts[:params]
    @data = opts[:data]
    @dependencies = opts[:dependencies] || []
    @dependencies = [@dependencies] unless @dependencies.is_a?(Array)
    @client_command = opts[:client_command] || @@default_client_command
    if @client_command.nil? || @client_command == ""
      raise(ArgumentError, "No command specified for job #{name}", caller)
    end
    @pre_run_handler = opts[:pre_run_handler]
    @output_handler = opts[:output_handler] || Proc.new { |file, job| puts file.gets }
    @post_run_handler = opts[:post_run_handler]
    @host = nil # hostname where the job is executed (nil if only scheduled)
    @worker_name = nil
    @working_directory = ""
    @numTries = 0
    @results = nil
    @filesToRemove = [] # paths of files to be removed after the command has run
    @status = WAITING
    @failedOnHosts = []
  end

  # Calls the +client_command+ on the given host.
  # The job will be executed by a thread +worker_name+ in +working_directory+.
  # If hostname != "localhost", the command is executed via ssh on the remote
  # machine. A +Proc+ command is always called on the server itself.
  # A run that ends with false/nil +results+ records +hostname+ in +failedOnHosts+.
  # Returns +@results+.
  def run(hostname, worker_name, working_directory) # :nodoc:
    @host = hostname
    @worker_name = worker_name
    @working_directory = working_directory || ""
    @results = true # default result, can be changed by user
    @status = RUNNING
    @numTries += 1
    @pre_run_handler.call(self) if @pre_run_handler
    if @client_command.is_a?(Proc)
      @host = "localhost"
      @client_command.call(self) # run the Proc on the server (ignore that it should run remotely)
    else
      # The command line is built only now because the pre_run_handler may
      # have changed the working directory or the command.
      runCommand(build_shell_command(hostname))
    end
    @post_run_handler.call(self) if @post_run_handler
    # Must update the instance variable; a bare "failedOnHosts |= ..." would
    # only create a local variable and the failure would be forgotten.
    @failedOnHosts |= [hostname] unless @results
    @status = @results ? SUCCESS : FAILED
    @results
  end

  # Do you want to store the data of an object in a file
  # that will be accessible to the command? If yes, then use this method.
  # +data+:: is an object that can be written with puts
  # +filename+:: is relative to +working_directory+
  # +temporary+:: determines if the file should be removed after the job has run.
  # Note: you could use this method in +pre_run_handler+ even to create the client
  # script before executing it.
  def store_data(data, filename, temporary = true)
    target = path_in_working_directory(filename)
    if @host == "localhost"
      # File.open does not expand "~", the shell that runs the command does.
      target = File.expand_path(target)
      File.open(target, "w") { |file| file.puts data }
    else
      tempFile = Tempfile.new("#{File.basename(filename)}.tmp")
      begin
        tempFile.puts data
        tempFile.close
        # copyToHost prepends the working directory itself, so it gets the
        # relative name (passing +target+ would apply the directory twice).
        copyToHost(tempFile.path, filename)
      ensure
        tempFile.close(true) # remove local temp file
      end
    end
    @filesToRemove << target if temporary
  end

  # Returns the name and the current state of the job as a string.
  def to_s
    s = "#{@name}: "
    case status
    when WAITING
      s += "waiting"
    when RUNNING
      s += "running"
    when SUCCESS
      s += "finished"
    when FAILED
      s += "failed"
    end
    s += ", try #{@numTries}" if @numTries > 1
    s
  end

  # This is called by the jobserver in order to decide if the job should be run
  # on the given host. Default behaviour is true (run on all hosts).
  # You can override this function, e.g.:
  #   require 'jobserver'
  #
  #   class Job
  #     def runsOnHost(hostname)
  #       case hostname
  #       when ...
  #       end
  #     end
  #   end
  # Beware not to construct dead-lock situations when no host satisfies a constraint.
  def runsOnHost(hostname)
    true
  end

  # Returns a subset of the given hosts for which #runsOnHost is +true+.
  def runsOnHosts(hosts)
    hosts.select { |host| runsOnHost(host) }
  end

  protected

  # Copy files to the remote(!) host with scp.
  # Perhaps a file will have to be created on the fly by the job creator.
  # +source+ are the files on the local host
  # +destination+ the destination on @host, relative to
  # @working_directory. Used by +store_data+.
  # Returns what Kernel#system returns for the scp call.
  # NOTE(review): the method is protected, so a handler Proc defined outside
  # of Job can only reach it through a subclass or +send+.
  def copyToHost(source, destination)
    system("scp -q #{source} #{@host}:#{path_in_working_directory(destination)}")
  end

  # Called by +run+. Executes +command+ in a shell and feeds its output to the
  # output handler until end of file; the handler has to consume input on every
  # call, otherwise this loop never ends. Temporary files registered by
  # +store_data+ are removed afterwards, even if a handler raised.
  def runCommand(command)
    puts "Running job ##{@number} on #{@worker_name}: #{command}" if @@verbose > 0
    begin
      IO.popen(command, "r") do |cin|
        until cin.eof?
          if @output_handler
            @output_handler.call(cin, self)
          else
            cin.gets # read the line
          end
        end
      end
    ensure
      remove_temporary_files
      @worker_name = nil # mark the job as finished
    end
  end

  # Quotes all shell-special characters in the command.
  def quote(command)
    command.gsub(/[|&;()\\<>'"]/) { |x| '\\' + x }
  end

  private

  # Builds the shell command line for a string +client_command+:
  # optional "cd", optional "nice", the command and its parameters,
  # wrapped into an ssh call for remote hosts.
  def build_shell_command(hostname)
    # "nice -n N" is the POSIX spelling of the obsolete "nice -N".
    nice = use_nice? ? "nice -n #{@@nicelevel}" : ""
    dir = @working_directory.to_s
    chdir = dir.empty? ? "" : "cd #{quote(dir)};"
    cmd = "#{chdir}#{nice} #{@client_command} #{@params}"
    cmd = "ssh #{hostname} #{quote(cmd)}" if hostname != "localhost"
    cmd
  end

  # True if the command should be prefixed with +nice+: a nice level is set
  # and the server does not run on Windows. RUBY_PLATFORM replaces the
  # PLATFORM constant that no longer exists; the pattern names the Windows
  # platforms explicitly because a plain /win/ also matches "darwin".
  def use_nice?
    return false unless @@nicelevel
    RUBY_PLATFORM.downcase !~ /mswin|mingw|cygwin|bccwin|wince/
  end

  # Returns +filename+ prefixed with the working directory. An empty working
  # directory leaves the name unchanged (File.join("", name) would yield
  # "/name", i.e. a path in the root directory).
  def path_in_working_directory(filename)
    dir = @working_directory.to_s
    dir.empty? ? filename : File.join(dir, filename)
  end

  # Deletes the files registered as temporary and forgets them, so that a
  # retried job does not try to delete them a second time.
  def remove_temporary_files
    files = @filesToRemove.uniq
    @filesToRemove.clear
    return if files.empty?
    if @host == "localhost"
      files.each { |path| File.delete(path) if File.exist?(path) }
    else
      system("ssh #{@host} rm -f #{files.join(' ')}")
    end
  end
end
287
+
288
+ ###############################################################################
289
+
290
# This class is used internally by the JobServer to collect statistics for
# each host: how many jobs were started, finished and failed, and how much
# wall-clock time was spent between begin_update and end_update.
class HostStatistics # :nodoc:
  # number of jobs that terminated successfully
  attr_accessor :num_jobs_finished

  def initialize
    @num_jobs_started = 0
    @num_jobs_finished = 0
    @num_jobs_failed = 0
    @userTime = 0.0   # accumulated seconds between begin_update and end_update
    @startTime = nil  # time of the last begin_update; nil while no job is being timed
  end

  # is called before a job is run
  def begin_update
    @startTime = Time.new
    @num_jobs_started += 1
  end

  # is called when a job has terminated; +success+ tells whether it is
  # counted as finished (true) or failed (false/nil).
  # Elapsed time is only added when a begin_update preceded this call:
  # without a start time there is nothing to measure (subtracting the former
  # 0.0 placeholder from a Time yields a Time and made the addition raise).
  def end_update(success)
    if @startTime
      @userTime += Time.new - @startTime
      @startTime = nil
    end
    if success
      @num_jobs_finished += 1
    else
      @num_jobs_failed += 1
    end
  end

  # returns the current statistics as one line of text.
  # The average is the total measured time (failed runs included) divided by
  # the number of finished jobs.
  def to_s
    avgTime = @num_jobs_finished > 0 ? '%.1f s' % (@userTime / @num_jobs_finished) : '<no jobs finished>'
    s = "Number of jobs finished: #{@num_jobs_finished}, average time per job: #{avgTime}"
    running = @num_jobs_started - @num_jobs_failed - @num_jobs_finished
    s += ", #{running} job#{(running > 1) ? 's' : ''} running" if running > 0
    s += ", #{@num_jobs_failed} job#{(@num_jobs_failed > 1) ? 's' : ''} failed" if @num_jobs_failed > 0
    s
  end
end
329
+
330
+ ###### Job Server #############################################################
331
+
332
+
333
# The JobServer distributes the jobs of a queue to worker threads. Every
# worker serves one client (the local machine or a remote host reached by
# ssh) and runs one job at a time on it.
#
# == Usage example:
#
# === Create job handlers
#   require 'jobserver'
#   pre_run_handler = Proc.new do |job|
#     puts "Running job #{job.name} on #{job.host}"
#     # Initialize the results object as an empty array:
#     job.results = []
#   end
#   output_handler = Proc.new do |file, job|
#     line = file.gets
#     job.results << $1 if line =~ /result: (.*)/
#   end
#   post_run_handler = Proc.new do |job|
#     if job.results.empty?
#       puts "Error executing job #{job.name} on #{job.host}.\n\t#{job}"
#     else
#       puts $result = job.results.join(",")
#     end
#   end
# === Create the jobs
#   Job.default_client_command = "runclient"
#   myJobQueue = []
#   10.times{|i| myJobQueue << Job.new(:name=>"job#{i}", :params=>"#{i}", :pre_run_handler => pre_run_handler,
#     :output_handler=>output_handler, :post_run_handler=>post_run_handler)}
# === Create the server
#   server = JobServer.new(myJobQueue, "~/work") #run 1 local worker implicitly
#   server.add_ssh_worker("192.168.0.1", "~/work_sparc")
#   server.add_ssh_worker("192.168.0.2", "~/work", 2)
#   server.dumpStatistics
#   server.serve # Wait until all jobs have finished
#
class JobServer
  # Raised by #serve when jobs are left in the queue but no worker will ever
  # be able to run one of them. (Kept as a direct subclass of Exception so
  # that existing rescue clauses behave as before.)
  class Deadlock < Exception; end

  # an array of the worker threads
  attr_reader :workers
  # <tt>hostStats[worker_name]</tt> returns the +HostStatistics+ object of that worker
  attr_reader :hostStats

  # Instantiates a new JobServer object and creates the given number of local clients,
  # called _workers_.
  # +jobQueue+::
  #   is an array of jobs of type Job. The array itself is used as the queue
  #   (it is extended with MonitorMixin and jobs are removed from it).
  #   Jobs on the _left_ side of the array are processed first.
  #   Jobs that had errors during execution will be enqueued at the end of the queue.
  #   If you want to decide whether to re-enqueue a job that had errors you can replace
  #   the method #retryJob by your own.
  # +local_working_directory+:: is the directory in which local clients are launched
  # +numLocalWorkers+::
  #   is the number of client workers which run on the server itself (e.g. number of CPUs)
  # +terminateWorkersWhenJobQueueEmpty+::
  #   false means, the workers continue to run, even if the queue is empty
  #   until #close was called. This can be used if you want to add jobs while
  #   others are running. The situation
  #   can occur that all remaining jobs are running and the queue is empty but
  #   you want to add more jobs.
  def initialize(jobQueue, local_working_directory = "", numLocalWorkers = 1, terminateWorkersWhenJobQueueEmpty = true)
    @jobQueue = jobQueue
    @jobQueue.extend(MonitorMixin)
    @initialQueueLength = @jobQueue.length
    @jobsRunning = []
    @jobsRunning.extend(MonitorMixin)
    # Signalled (while holding the @jobQueue monitor) whenever something
    # changed that may allow a waiting worker to proceed.
    @noJobs_cond = @jobQueue.new_cond
    @local_working_directory = local_working_directory || ""
    @workers = []
    @hostStats = Hash.new
    @hostStats.extend(MonitorMixin)
    @terminateWorkersWhenJobQueueEmpty = terminateWorkersWhenJobQueueEmpty
    @usedHosts = []        # hosts that have at least one worker
    @serving = false       # set by #serve; workers do nothing before
    @busyWorkers = 0       # workers that took a job and have not yet finished handling it
    @deadlock = nil        # Deadlock exception once a deadlock has been detected
    @dumpStatThread = nil
    @statsFilename = nil
    add_local_worker(numLocalWorkers)
  end

  # serve starts the workers, waits for all jobs to terminate and outputs
  # statistics if verbose is true.
  # Raises Exception if no worker is registered and Deadlock if jobs remain
  # that no worker can run.
  def serve(verbose = true)
    raise(Exception, "No workers registered but serve was called. The jobs can't be processed!", caller) if @workers.empty?
    # The workers wait on the condition variable until @serving is set; unlike
    # Thread#wakeup this cannot be missed by a thread that is not asleep yet.
    @jobQueue.synchronize do
      @serving = true
      @noJobs_cond.broadcast
    end
    begin
      @workers.each { |worker| worker.join } # wait for all workers to finish
    ensure
      finish_statistics_dump
    end
    raise @deadlock if @deadlock

    # output statistics
    if verbose
      puts "Host statistics:\n================"
      sorted_host_statistics.each { |host, stats| puts "#{host}: #{stats}" }
      puts
    end
  end

  # close must be called only when you have set terminateWorkersWhenJobQueueEmpty
  # to false during instantiation. Then you tell the server that no further
  # jobs will be added to the job queue.
  # All worker threads that have been waiting for new jobs will terminate now.
  # You should still wait for all workers to complete. Use *serve* to do so.
  def close
    # The condition variable may only be used while owning the monitor, and
    # every waiting worker has to be woken, not just one of them.
    @jobQueue.synchronize do
      @terminateWorkersWhenJobQueueEmpty = true
      @noJobs_cond.broadcast
    end
  end

  # Writes statistics for each host to the given file.
  # If +filename+ has no directory part, the file is placed in the local
  # working directory given to +new+ (if that is not empty).
  # +timeInSec+ is the time after which the current statistics will
  # be written. Time set to 0 means, write only when all jobs have finished.
  # When #serve ends, the last state is written regardless of the time.
  # Returns the thread that writes the statistics.
  def dumpStatistics(filename = "jobserver_stats.txt", timeInSec = 60)
    if filename == File.basename(filename) && !@local_working_directory.to_s.empty?
      # expand_path resolves a leading "~", which File.open would not do
      filename = File.join(File.expand_path(@local_working_directory), filename)
    end
    @dumpStatThread.kill if @dumpStatThread # replace an earlier dumper
    @statsFilename = filename
    @dumpStatThread = Thread.new(timeInSec) do |sleepTime|
      loop do
        # 0 means "only at the end": sleep until #serve stops this thread
        if sleepTime > 0 then sleep(sleepTime) else sleep end
        write_statistics(filename)
      end
    end
  end

  # appends a job at the end of the queue and informs the sleeping worker
  # threads that new jobs are available. Returns the server.
  def add_job(job)
    @jobQueue.synchronize do
      @jobQueue << job
      @noJobs_cond.broadcast # wake up workers that wait for new jobs
    end
    self
  end
  alias :<< :add_job

  # Adds a worker thread that processes jobs on the local machine.
  # This method is called automatically on object instantiation.
  # +numWorkers+ indicates, how many workers should use this client. It should usually not
  # exceed the number of CPUs that are available for the host.
  # You may wish to set it to zero, if the server machine should not be used as a client
  # as well.
  def add_local_worker(numWorkers = 1)
    add_worker("localhost", @local_working_directory, numWorkers)
  end

  # Adds a worker thread that processes jobs on a given remote host machine.
  # +hostname+ can contain a username like: <tt>fool@192.168.0.1</tt>
  # If +working_directory+ is not empty then the client command will be executed in the
  # given directory.
  # +numWorkers+ indicates, how many workers should use this client. It should usually not
  # exceed the number of CPUs that are available for the host.
  # A +hostname+ equal to the HOSTNAME environment variable is treated as "localhost".
  def add_ssh_worker(hostname, working_directory = "", numWorkers = 1)
    hostname = "localhost" if ENV['HOSTNAME'] == hostname
    add_worker(hostname, working_directory, numWorkers)
  end

  # Is called when +job+ couldn't be executed. Determines whether to try to rerun the job.
  # Whether a job had errors or not depends on whether the +results+ object is false/nil
  # or not. See #Job.new for further details about +results+.
  # By default, +retryJob+ allows three tries until the job is given up.
  # If you want another behaviour, override the method +retryJob+ by your own, e.g.:
  #   require 'jobserver'
  #
  #   class JobServer
  #     def retryJob(job)
  #       puts "Job failed: #{job.name}"
  #       return false
  #     end
  #   end
  def retryJob(job)
    if job.numTries < 3
      puts "FAILURE: Will try to run job later: #{job}"
      true
    end # else return nil (false) implicitly
  end

  # Removes and returns a Job from the queue using a FIFO strategy but
  # considering the dependency constraints of the jobs. That means, only
  # those jobs are chosen that have only jobs in their dependency list that
  # are finished. The user has to guarantee that the dependency-graph has no
  # cycles.
  # Furthermore each job has a function :runsOnHost that you might want to
  # override in order to decide whether the job is fit to run on the given host.
  # It may happen that no host gets a job. This may result in a job that failed but
  # that is crucial to the execution of others so that these cannot be executed any
  # more. This deadlock situation makes #serve raise an exception.
  #
  # When a job failed on a host, it tries to rerun on other hosts first (if there are any).
  # Returns nil if no job of the queue can run on +hostname+ right now.
  def getNextJob(hostname)
    @jobQueue.synchronize do
      job = find_runnable_job(hostname)
      job && @jobQueue.delete(job)
    end
  end

  private

  # Creates +numWorkers+ new worker threads for +hostname+. This method is
  # called by +add_local_worker+ and +add_ssh_worker+.
  # The workers do not take jobs before JobServer#serve was called.
  def add_worker(hostname, working_directory, numWorkers)
    return if numWorkers < 1 # a host without workers must not count as usable
    @jobQueue.synchronize { @usedHosts |= [hostname] }
    numWorkers.times do |i|
      worker_name = hostname + ((numWorkers > 1) ? "_#{i}" : "")
      @hostStats.synchronize { @hostStats[worker_name] ||= HostStatistics.new }
      @workers << Thread.new do
        while (job = wait_for_job(hostname))
          process_job(job, hostname, worker_name, working_directory)
        end
      end
    end
  end

  # Blocks the calling worker until it gets a job for +hostname+ (returned,
  # already removed from the queue) or until it should terminate (nil).
  # A worker terminates when the queue is empty, no other worker is still
  # busy with a job (which might fail and be re-enqueued) and termination on
  # an empty queue is wanted, or when a deadlock has been detected.
  def wait_for_job(hostname)
    @jobQueue.synchronize do
      loop do
        return nil if @deadlock
        if @serving
          if @jobQueue.empty?
            return nil if @terminateWorkersWhenJobQueueEmpty && @busyWorkers == 0
          else
            job = getNextJob(hostname)
            if job
              @busyWorkers += 1
              return job
            end
            # Nothing for this host. If additionally nobody is running a job
            # and no host at all could take one, waiting would last forever.
            if @busyWorkers == 0 && !job_runnable_anywhere?
              $stderr.puts deadlock_report
              @deadlock = Deadlock.new("JobServer deadlock: no worker can get a job but there are jobs in the queue.")
              @noJobs_cond.broadcast # let the other workers terminate, too
              return nil
            end
          end
        end
        # sleep until new jobs arrived, some job finished, or close/serve was called
        @noJobs_cond.wait
      end
    end
  end

  # Runs +job+ on the worker's host, updates the statistics and re-enqueues
  # the job if it failed and #retryJob agrees.
  def process_job(job, hostname, worker_name, working_directory)
    requeue = false
    results = nil
    @jobsRunning.synchronize { @jobsRunning << job }
    @hostStats.synchronize { @hostStats[worker_name].begin_update }
    begin
      results = job.run(hostname, worker_name, working_directory)
      if results
        job.dependencies = nil # free them for the garbage collection
      elsif retryJob(job)
        job.status = Job::WAITING
        requeue = true
      end
    ensure
      @hostStats.synchronize { @hostStats[worker_name].end_update(results) }
      @jobsRunning.synchronize { @jobsRunning.delete(job) }
      @jobQueue.synchronize do
        @jobQueue << job if requeue
        @busyWorkers -= 1
        # wake up workers that wait for new jobs, because some
        # constraint/dependency could be satisfied now or a job came back
        @noJobs_cond.broadcast
      end
    end
  end

  # Returns (without removing it) the first job of the queue that may run on
  # +hostname+ now, or nil. Must be called while holding the queue monitor.
  def find_runnable_job(hostname)
    @jobQueue.find do |job|
      # it is assumed that all jobs from the queue are WAITING
      ok = Array(job.dependencies).all? { |dep| dep.status == Job::SUCCESS } &&
           job.runsOnHost(hostname)
      if ok && job.failedOnHosts.include?(hostname)
        # The job failed on this host already. If there are hosts left it can
        # run on and did not fail on yet, leave it to them. If it failed on
        # all of them, try the current host again.
        ok = job.runsOnHosts(@usedHosts - job.failedOnHosts).empty?
      end
      ok
    end
  end

  # True if at least one host that has workers could take a job of the queue.
  def job_runnable_anywhere?
    @usedHosts.any? { |host| find_runnable_job(host) }
  end

  # Builds the text that describes a deadlock: the queue and, for every job
  # in it, the dependencies that are not finished. The deadlock can be caused
  # by unsatisfiable or circular dependencies or by host constraints.
  def deadlock_report
    lines = []
    lines << "\nDeadlock! -- Current Job Queue (#{@jobQueue.length} jobs):\n========================================="
    lines.concat(@jobQueue.map { |queued| queued.to_s })
    lines << "\nJobs with unsatisfied dependencies:"
    @jobQueue.each do |queued|
      lines << "#{queued.name}:"
      unsatisfied = Array(queued.dependencies).select { |dep| dep.status != Job::SUCCESS }
      if unsatisfied.empty?
        lines << " no unsatisfied jobs"
      else
        lines << " " + unsatisfied.join("\n ")
      end
    end
    lines << ""
    lines << "Before debugging the jobserver please verify, if"
    lines << " a) no job on which a job form the job queue depends on failed"
    lines << " b) the jobs form the job queue cannot be run on any host (if you redefine Job.runsOnHost)"
    lines.join("\n")
  end

  # Returns the [worker_name, statistics] pairs sorted by worker name.
  def sorted_host_statistics
    @hostStats.synchronize { @hostStats.sort_by { |host, _stats| host } }
  end

  # Writes the current statistics, the running jobs and the queue to
  # +filename+. Writing statistics is a side task: if the file cannot be
  # written, a warning is printed instead of aborting the server.
  def write_statistics(filename)
    File.open(filename, "w") do |file|
      file.puts "Host statistics:\n================"
      sorted_host_statistics.each { |host, stats| file.puts "#{host}: #{stats}" }
      @jobsRunning.synchronize do
        unless @jobsRunning.empty?
          file.puts "\nJobs running:\n============"
          file.puts @jobsRunning.map { |job| "#{job.host}: #{job}" }, ""
        end
      end
      @jobQueue.synchronize do
        unless @jobQueue.empty?
          file.puts s = "Jobs in the queue: (#{@jobQueue.length}/#{@initialQueueLength} remaining)"
          file.puts "=" * s.length
          file.puts @jobQueue
        end
      end
    end
  rescue SystemCallError, IOError => e
    $stderr.puts "JobServer: cannot write statistics to #{filename}: #{e.message}"
  end

  # Stops the statistics thread started by #dumpStatistics (if any) and
  # writes the final state of the statistics.
  def finish_statistics_dump
    return unless @dumpStatThread
    @dumpStatThread.kill
    @dumpStatThread.join
    @dumpStatThread = nil
    write_statistics(@statsFilename)
  end
end