jobserver 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,619 @@
1
+ #
2
+ # Jobserver version 0.1.4
3
+ #
4
+ # Copyright (c) 2004 Christian Bang <cbang@web.de>
5
+ #
6
+ # This program is free software.
7
+ # You can distribute/modify this program under the same terms of ruby.
8
+ #
9
+ # The class +JobServer+ supplies capabilities to execute jobs on the
10
+ # local- or on remote hosts.
11
+ # * Jobs encapsulate the call of a command in a shell on a (remote) host.
12
+ # * Each client is controlled by a _worker_-thread on the server.
13
+ # When a client is idle it receives the next job from the queue.
14
+ # * Remote jobs will be launched using ssh. Therefore you must
15
+ # configure your ssh keys for password-less authentication.
16
+ # * A common home directory is _not_ needed for the clients.
17
+ # Different machine architectures as well as binaries are possible.
18
+ # * Data for the clients can be saved to files on the client machines.
19
+ # See store_data.
20
+ # == How to use the jobserver
21
+ # 1. Set up a default command to be launched in the client's working
22
+ # directory.
23
+ # 2. Write a client output handler that parses the output lines,
24
+ # retrieves the results and stores them for you.
25
+ # 3. Create your jobs, give them the parameter settings they need
26
+ # to run the client command. Give them additional data if you want.
27
+ # 4. Create a JobServer instance.
28
+ # 5. Declare the client machines you want to use.
29
+ # 6. Run the JobServer and wait for all jobs to finish.
30
+ # See an example in the description of +JobServer+ or have a look at
31
+ # the examples/ directory of the jobserver package.
32
+
33
+ require 'thread'
34
+ require 'monitor'
35
+ require 'tempfile'
36
+
37
+ ###### Job ####################################################################
38
+
39
+ # Use +Job+ objects to generate a _jobQueue_ array for the +JobServer+.
40
+ # See +JobServer+ for an example.
41
class Job
  # Execution states reported by #status.
  WAITING = 0
  RUNNING = 1
  FAILED = 2
  SUCCESS = 3

  @@default_client_command = ""
  @@nicelevel = 19 # default process priority for the command
  @@jobNumber = 0  # class variable to give each new job a new number
  @@verbose = 1

  # Returns the default client command that will be used if none is specified in #new.
  def Job::default_client_command() @@default_client_command end
  # Sets the default client command that will be used if none is specified in #new.
  def Job::default_client_command=(s) @@default_client_command = s end
  # Returns the nice level (priority) used when the client is run. Default is 19.
  # For further information about +nice+ see the man-pages for +nice+.
  def Job::nicelevel() @@nicelevel end
  # Sets the nice level (priority) used when the client is run. Default is 19.
  # If you set it to nil, the +nice+ command will not be used.
  def Job::nicelevel=(v) @@nicelevel = v end
  # Sets the verbose level. 0 means no output, > 0 means output. Default is 1.
  def Job::verbose=(v) @@verbose = v end
  # Returns the verbose level. 0 means no output, > 0 means output.
  # The optional argument is ignored; it is accepted only because earlier
  # versions (by mistake) required one.
  def Job::verbose(_ignored = nil) @@verbose end

  # Is the string identifier of the job. Set in #new.
  attr_reader :name
  # Is an arbitrary object containing user data for the job. Set in #new.
  attr_accessor :data
  # Name of the host the job is (or was last) executed on; nil while the job
  # has never been run. Set in +run+ ("localhost" for +Proc+ commands).
  attr_reader :host
  # Will be set when job is run and contains the working directory on the
  # (remote) host. Set in +run+.
  # You could change it in the +pre_run_handler+.
  attr_accessor :working_directory
  # The parameters for the +client_command+, set in #new.
  attr_reader :params
  # The command to be executed on the (remote) machine. Set in #new.
  # This is either a string or a +Proc+ object (see #new).
  attr_accessor :client_command
  # The results object that may be modified in the different handlers.
  # It is nil until the job is run and set to true at the start of +run+.
  # If it is false or nil in the end, the job is regarded as FAILED,
  # otherwise as SUCCESS.
  attr_accessor :results
  # Returns the status of the execution of the job in one of the values:
  # WAITING, RUNNING, FAILED, SUCCESS
  attr_accessor :status
  # An array of jobs that must finish before this job can run.
  # (This means, they need all status == SUCCESS.)
  attr_accessor :dependencies
  # Number of times the job has been run so far
  attr_reader :numTries
  # Unique, consecutive number of the job (assigned in #new)
  attr_reader :number
  # list of host names the job failed on
  attr_reader :failedOnHosts

  # The arguments must be passed as a hash where the keys are the parameter names listed below.
  # String keys are accepted as well as symbols; the given hash is not modified.
  # +:name+:: is the string identifier of the job
  # +:params+:: are the parameters for the command
  # +:data+::
  #   is an arbitrary object containing user data for the job.
  # +:dependencies+::
  #   A job or an array of jobs that must be finished first. Default is none.
  # +:client_command+::
  #   either a string with the command to call on the (remote) machine or a
  #   +Proc+ object that receives the job and is executed on the server.
  #   For +Proc+ commands no output handler is called.
  #   Defaults to +default_client_command+.
  # <tt>:pre_run_handler |job|</tt>:: Proc called before the command is run.
  # <tt>:output_handler |file,job|</tt>::
  #   Proc that reads the command output (file.gets). The default handler
  #   puts the lines to stdout.
  # <tt>:post_run_handler |job|</tt>:: Proc called after the command is run.
  #
  # Raises ArgumentError if +args+ is not a hash or no command is available.
  def initialize(args)
    raise(ArgumentError, "Hash arguments expected.", caller) unless args.kind_of?(Hash)
    # Copy into a fresh hash: adding keys to a hash while iterating over it
    # raises on current rubies, and the caller's hash should stay untouched.
    options = {}
    args.each_pair { |key, value| options[key.to_sym] = value } # allow string keys also
    @@jobNumber += 1
    @number = @@jobNumber
    @name = options[:name]
    @params = options[:params]
    @data = options[:data]
    @dependencies = options[:dependencies] || []
    @dependencies = [@dependencies] unless @dependencies.is_a?(Array)
    @client_command = options[:client_command] || @@default_client_command
    if @client_command.nil? or @client_command == ""
      raise(ArgumentError, "No command specified for job #{name}", caller)
    end
    @pre_run_handler = options[:pre_run_handler]
    @output_handler = options[:output_handler] || Proc.new { |file, job| puts file.gets }
    @post_run_handler = options[:post_run_handler]
    @host = nil # hostname where the job is executed (nil if only scheduled)
    @worker_name = nil
    @working_directory = ""
    @numTries = 0
    @results = nil
    @filesToRemove = [] # files (on @host) to be removed after the command has run
    @status = WAITING
    @failedOnHosts = []
  end

  # Calls the +client_command+ on the given host.
  # The job will be executed by a thread +worker_name+ in +working_directory+.
  # If hostname != "localhost", the command is executed via ssh on the remote
  # machine. A host on which the job ends without results is remembered in
  # +failedOnHosts+.
  # Returns +@results+.
  def run(hostname, worker_name, working_directory) # :nodoc:
    @host = hostname
    @worker_name = worker_name
    @working_directory = working_directory || ""
    @results = true # default result, can be changed by user
    @status = RUNNING
    @numTries += 1
    @pre_run_handler.call(self) if @pre_run_handler
    if @client_command.is_a?(Proc)
      @host = "localhost"
      @client_command.call(self) # run the Proc on the server (ignore that it should run remotely)
    else
      chdir = (@working_directory != "") ? "cd #{quote(@working_directory)};" : ""
      cmd = "#{chdir}#{nicePrefix} #{@client_command} #{@params}"
      cmd = "ssh #{hostname} #{quote(cmd)}" if hostname != "localhost"
      runCommand(cmd)
    end
    @post_run_handler.call(self) if @post_run_handler
    # must update the instance variable; a bare name would only create a local
    @failedOnHosts |= [hostname] unless @results
    @status = @results ? Job::SUCCESS : Job::FAILED
    return @results
  end

  # Do you want to store the data of an object in a file
  # that will be accessible to the command? If yes, then use this method.
  # +data+:: is an object that can be written with puts
  # +filename+:: is relative to +working_directory+
  # +temporary+:: determines if the file should be removed after the job has run.
  # Note: you could use this method in +pre_run_handler+ even to create the client
  # script before executing it.
  def store_data(data, filename, temporary = true)
    # File.join("", name) would yield "/name", so only join a non-empty directory
    target = @working_directory.to_s.empty? ? filename : File.join(@working_directory, filename)
    if @host == "localhost"
      target = File.expand_path(target) # File.open does not expand "~" itself
      File.open(target, "w") { |file| file.puts data }
    else
      tempFile = Tempfile.new("#{File.basename(filename)}.tmp")
      begin
        tempFile.puts data
        tempFile.close
        # copyToHost prepends the working directory itself, so pass the relative name
        copyToHost(tempFile.path, filename)
      ensure
        tempFile.close(true) # remove local temp file
      end
    end
    @filesToRemove << target if temporary
  end

  # Returns the current state of the job.
  def to_s
    s = "#{@name}: "
    case status
    when WAITING
      s += "waiting"
    when RUNNING
      s += "running"
    when SUCCESS
      s += "finished"
    when FAILED
      s += "failed"
    end
    s += ", try #{@numTries}" if @numTries > 1
    return s
  end

  # This is called by the jobserver in order to decide if the job should be run
  # on the given host. Default behaviour is true (run on all hosts).
  # You can override this function, e.g.:
  #   require 'jobserver'
  #
  #   class Job
  #     def runsOnHost(hostname)
  #       case hostname
  #         when ...
  #       end
  #     end
  #   end
  # Beware not to construct dead-lock situations when no host satisfies a constraint.
  def runsOnHost(hostname)
    true
  end

  # Returns a subset of the given hosts for which #runsOnHost is +true+.
  def runsOnHosts(hosts)
    hosts.select { |host| runsOnHost(host) }
  end

  protected

  # Copy files to the remote(!) host.
  # +source+:: local file name(s); a string is split on whitespace, an array
  #            is used as given
  # +destination+:: the destination on @host, relative to @working_directory.
  # Used by +store_data+. Returns the result of Kernel#system.
  def copyToHost(source, destination)
    sources = source.is_a?(Array) ? source : source.to_s.split
    remote_path = @working_directory.to_s.empty? ? destination : "#{@working_directory}/#{destination}"
    # argument list form: no local shell touches the names
    system("scp", "-q", *(sources + ["#{@host}:#{remote_path}"]))
  end

  # Called by +run+: executes +command+, feeds its output to the output
  # handler and removes the temporary files registered by +store_data+.
  def runCommand(command)
    puts "Running job ##{@number} on #{@worker_name}: #{command}" if @@verbose > 0
    IO.popen(command, "r") do |cin|
      until cin.eof?
        if @output_handler
          @output_handler.call(cin, self)
        else
          cin.gets # read the line
        end
      end
    end
    removeTemporaryFiles
    @worker_name = nil # mark the job as finished
  end

  # Removes the files registered as temporary and forgets them, so that a
  # retried job does not try to delete them a second time.
  def removeTemporaryFiles
    files = @filesToRemove.uniq
    @filesToRemove.clear
    return if files.empty?
    if @host == "localhost"
      files.each { |path| File.delete(path) if File.exist?(path) }
    else
      # argument list form: "~" is expanded by the remote shell, not locally
      system("ssh", @host, "rm", *files)
    end
  end

  # Returns the "nice" prefix for the command line, or "" when no nice level
  # is set or the platform is a Windows ruby build.
  def nicePrefix
    return "" unless @@nicelevel
    # RUBY_PLATFORM replaces the removed PLATFORM constant; a plain /win/
    # would also match "darwin".
    return "" if RUBY_PLATFORM =~ /mswin|mingw|bccwin|wince/i
    "nice -#{@@nicelevel}"
  end

  # Quotes all shell-special characters in the command.
  def quote(command)
    command.gsub(/[|&;()\\<>'"]/) { |x| '\\' + x }
  end
end
287
+
288
+ ###############################################################################
289
+
290
+ # This class is used internally by the JobServer to collect statistics for
291
+ # each host.
292
class HostStatistics # :nodoc:
  # Number of jobs that ended successfully on this host.
  attr_accessor :num_jobs_finished

  # Starts with all counters at zero and no time accumulated.
  def initialize
    @num_jobs_started = @num_jobs_finished = @num_jobs_failed = 0
    @userTime = 0.0
    @startTime = 0.0
  end

  # To be called right before a job is run: remembers the start time and
  # counts the job as started.
  def begin_update
    @startTime = Time.new
    @num_jobs_started += 1
  end

  # To be called when a job has terminated: adds the elapsed time since
  # +begin_update+ and counts the job as finished or failed, depending on
  # +success+.
  def end_update(success)
    @userTime += Time.new - @startTime
    return @num_jobs_finished += 1 if success
    @num_jobs_failed += 1
  end

  # Returns a one-line summary: finished jobs with their average time, plus
  # the number of running and failed jobs when there are any.
  def to_s
    average = if @num_jobs_finished > 0
                format('%.1f s', @userTime / @num_jobs_finished)
              else
                '<no jobs finished>'
              end
    summary = ["Number of jobs finished: #{@num_jobs_finished}, average time per job: #{average}"]
    running = @num_jobs_started - @num_jobs_failed - @num_jobs_finished
    summary << "#{running} job#{'s' if running > 1} running" if running > 0
    summary << "#{@num_jobs_failed} job#{'s' if @num_jobs_failed > 1} failed" if @num_jobs_failed > 0
    summary.join(", ")
  end
end
329
+
330
+ ###### Job Server #############################################################
331
+
332
+
333
+ # == Usage example:
334
+ #
335
+ # === Create job handlers
336
+ # require 'jobserver'
337
+ # pre_run_handler = Proc.new do |job|
338
+ # puts "Running job #{job.name} on #{job.host}"
339
+ # # Initialize the results object as an empty array:
340
+ # job.results = []
341
+ # end
342
+ # output_handler = Proc.new do |file, job|
343
+ # line = file.gets
344
+ # job.results << $1 if line =~ /result: (.*)/
345
+ # end
346
+ # post_run_handler = Proc.new do |job|
347
+ # if job.results.empty?
348
+ # puts "Error executing job #{job.name} on #{job.host}.\n\t#{job}"
349
+ # else
350
+ # puts $result = job.results.join(",")
351
+ # end
352
+ # end
353
+ # === Create the jobs
354
+ # Job.default_client_command = "runclient"
355
+ # myJobQueue = []
356
+ # 10.times{|i| myJobQueue << Job.new(:name=>"job#{i}", :params=>"#{i}", :pre_run_handler => pre_run_handler,
357
+ # :output_handler=>output_handler, :post_run_handler=>post_run_handler)}
358
+ # === Create the server
359
+ # server = JobServer.new(myJobQueue, "~/work") #run 1 local worker implicitly
360
+ # server.add_ssh_worker("192.168.0.1", "~/work_sparc")
361
+ # server.add_ssh_worker("192.168.0.2", "~/work", 2)
362
+ # server.dumpStatistics
363
+ # server.serve # Wait until all jobs have finished
364
+ #
365
class JobServer
  # Raised (inside a worker, surfacing from #serve) when jobs remain in the
  # queue but no worker is able to run any of them.
  class Deadlock < Exception; end

  # an array of the worker threads created so far
  attr_reader :workers
  # <tt>hostStats[worker_name]</tt> returns the +HostStatistics+ object of that worker
  attr_reader :hostStats

  # Instantiates a new JobServer object and creates the given number of local clients,
  # called _workers_.
  # +jobQueue+::
  #   is an array of jobs of type Job.
  #   Jobs on the _left_ side of the array are processed first.
  #   Jobs that had errors during execution will be enqueued at the end of the queue.
  #   If you want to decide whether to re-enqueue a job that had errors you can replace
  #   the method #retryJob by your own.
  # +local_working_directory+:: is the directory in which local clients are launched
  # +numLocalWorkers+::
  #   is the number of client workers which run on the server itself (e.g. number of CPUs)
  # +terminateWorkersWhenJobQueueEmpty+::
  #   false means, the workers continue to run, even if the queue is empty
  #   until #close was called. This can be used if you want to add jobs while
  #   others are running.
  def initialize(jobQueue, local_working_directory = "", numLocalWorkers = 1, terminateWorkersWhenJobQueueEmpty = true)
    @jobQueue = jobQueue
    @jobQueue.extend(MonitorMixin)
    @initialQueueLength = @jobQueue.length
    @jobsRunning = []
    @jobsRunning.extend(MonitorMixin)
    @noJobs_cond = @jobQueue.new_cond
    @local_working_directory = local_working_directory
    @workers = []
    @hostStats = Hash.new
    @hostStats.extend(MonitorMixin)
    @terminateWorkersWhenJobQueueEmpty = terminateWorkersWhenJobQueueEmpty
    @usedHosts = []
    @serving = false    # workers hold back until #serve opens this gate
    @deadlocked = false # set by the worker that detects a deadlock
    @dumpStatThread = nil
    @dumpStatFile = nil
    add_local_worker(numLocalWorkers)
  end

  # serve starts the workers, waits for all of them to terminate and outputs
  # statistics if verbose is true. A Deadlock raised by a worker is re-raised here.
  def serve(verbose = true)
    raise(Exception, "No workers registered but serve was called. The jobs can't be processed!", caller) if @workers.empty?
    # Open the start gate under the monitor. Unlike Thread.stop/wakeup this
    # cannot be missed by a worker that has not reached its waiting point yet.
    @jobQueue.synchronize do
      @serving = true
      @noJobs_cond.broadcast
    end
    @workers.each { |worker| worker.join } # wait for all workers to finish

    finishStatisticsDump

    # output statistics
    if verbose
      puts "Host statistics:\n================"
      @hostStats.sort_by { |host, _stats| host }.each { |host, stats| puts "#{host}: #{stats}" }
      puts
    end
  end

  # close must be called only when you have set terminateWorkersWhenJobQueueEmpty
  # to false during instantiation. Then you tell the server that no further
  # jobs will be added to the job queue.
  # All worker threads that have been waiting for new jobs will terminate now.
  # You should still wait for all workers to complete. Use *serve* to do so.
  def close
    # The condition variable may only be signalled by the owner of the
    # monitor; broadcast, because every idle worker has to terminate.
    @jobQueue.synchronize do
      @terminateWorkersWhenJobQueueEmpty = true
      @noJobs_cond.broadcast
    end
  end

  # Writes statistics for each host to the given file.
  # If +filename+ has no directory part, the file is put into the local
  # working directory given to +new+ (if that is not empty).
  # +timeInSec+ is the time after which the current statistics will
  # be written. Time set to 0 means, write only when all jobs have finished.
  # When #serve has joined all workers, the final state is written in any case.
  # Returns the thread that does the periodic writing.
  def dumpStatistics(filename = "jobserver_stats.txt", timeInSec = 60)
    if filename == File.basename(filename) and !@local_working_directory.to_s.empty?
      filename = File.join(@local_working_directory, filename)
    end
    filename = File.expand_path(filename) # File.open does not expand "~"
    @dumpStatThread.kill if @dumpStatThread
    @dumpStatFile = filename
    @dumpStatThread = Thread.new(timeInSec) do |sleepTime|
      loop do
        # sleep(0) returns at once and would rewrite the file continuously;
        # without argument the thread sleeps until serve writes the final state
        if sleepTime.to_f > 0 then sleep(sleepTime) else sleep end
        writeStatistics(filename)
      end
    end
  end

  # appends a job at the end of the queue and informs the sleeping worker
  # threads that new jobs are available. Returns the server, so calls to
  # << can be chained.
  def add_job(job)
    @jobQueue.synchronize do
      @jobQueue << job
      @initialQueueLength += 1
      wakeWorkers
    end
    self
  end
  alias :<< :add_job

  # Adds a worker thread that processes jobs on the local machine.
  # This method is called automatically on object instantiation.
  # +numWorkers+ indicates, how many workers should use this client. It should usually not
  # exceed the number of CPUs that are available for the host.
  # You may wish to set it to zero, if the server machine should not be used as a client
  # as well.
  def add_local_worker(numWorkers = 1)
    add_worker("localhost", @local_working_directory, numWorkers)
  end

  # Adds a worker thread that processes jobs on a given remote host machine.
  # +hostname+ can contain a username like: <tt>fool@192.168.0.1</tt>
  # If +working_directory+ is not empty then the client command will be executed in the
  # given directory.
  # +numWorkers+ indicates, how many workers should use this client. It should usually not
  # exceed the number of CPUs that are available for the host.
  def add_ssh_worker(hostname, working_directory = "", numWorkers = 1)
    hostname = "localhost" if ENV['HOSTNAME'] == hostname
    add_worker(hostname, working_directory, numWorkers)
  end

  # Is called when +job+ couldn't be executed. Determines whether to try to rerun the job.
  # Whether a job had errors or not depends on whether the +results+ object is false/nil
  # or not. See #Job.new for further details about +results+.
  # By default, +retryJob+ allows three tries until the job is given up.
  # If you want another behaviour, override the method +retryJob+ by your own, e.g.:
  #   require 'jobserver'
  #
  #   class JobServer
  #     def retryJob(job)
  #       puts "Job failed: #{job.name}"
  #       return false
  #     end
  #   end
  def retryJob(job)
    return false unless job.numTries < 3
    puts "FAILURE: Will try to run job later: #{job}"
    true
  end

  # Removes and returns a Job from the queue using a FIFO strategy but
  # considering the dependency constraints of the jobs. That means, only
  # those jobs are chosen that have only jobs in their dependency list that
  # are finished. The user has to guarantee that the dependency-graph has no
  # cycles.
  # Furthermore each job has a function :runsOnHost that you might want to
  # override in order to decide whether the job is fit to run on the given host.
  #
  # When a job failed on a host, it is left for other hosts first, as long
  # as one of them still has an active worker. Returns nil if no job fits.
  def getNextJob(hostname)
    job = @jobQueue.find do |candidate|
      # it is assumed that all jobs from the queue are WAITING
      ok = (candidate.dependencies.all? { |dep| dep.status == Job::SUCCESS } and
            candidate.runsOnHost(hostname))
      if ok and candidate.failedOnHosts.include?(hostname)
        # Prefer hosts the job did not fail on yet. Only hosts whose workers
        # are still active count: a finished worker never picks the job up.
        ok = candidate.runsOnHosts(activeHosts - candidate.failedOnHosts).empty?
      end
      ok
    end
    return @jobQueue.delete(job) # returns nil if job == nil
  end

  private

  # Creates the worker threads of a host. This method is called by
  # +add_local_worker+ and +add_ssh_worker+.
  # The workers wait until JobServer#serve is called.
  def add_worker(hostname, working_directory, numWorkers)
    @usedHosts |= [hostname]
    numWorkers.times do |i|
      worker_name = hostname + ((numWorkers > 1) ? "_#{i}" : "")
      @hostStats.synchronize { @hostStats[worker_name] = HostStatistics.new }
      worker = Thread.new { runWorker(hostname, worker_name, working_directory) }
      worker[:hostname] = hostname # lets other workers see which hosts are still served
      @workers << worker
    end
  end

  # Body of a worker thread: waits for #serve, then processes jobs until
  # waitForJob reports that there is nothing left to do.
  def runWorker(hostname, worker_name, working_directory)
    Thread.current[:getNoJob] = false
    @jobQueue.synchronize { @noJobs_cond.wait_until { @serving } }
    while (job = waitForJob(hostname))
      processJob(job, hostname, worker_name, working_directory)
    end
  ensure
    # However the worker ends, the others must stop counting on it.
    @jobQueue.synchronize do
      Thread.current[:finished] = true
      wakeWorkers
    end
  end

  # Blocks until a job for +hostname+ is available and returns it (removed
  # from the queue). Returns nil when the worker should terminate: the queue
  # is empty and no more jobs are expected, or a deadlock was detected.
  # Raises Deadlock in the worker that detects the deadlock.
  def waitForJob(hostname)
    job = nil
    @jobQueue.synchronize do
      @noJobs_cond.wait_while do
        if @deadlocked
          job = nil
          false # give up quietly, the detecting worker raises
        else
          job = @jobQueue.empty? ? nil : getNextJob(hostname)
          Thread.current[:getNoJob] = job.nil?
          # wait while the queue is empty and we don't want to terminate yet
          # or while no queued job can be run by this worker right now
          dowait = (job.nil? and !(@jobQueue.empty? and @terminateWorkersWhenJobQueueEmpty))
          reportDeadlock if dowait and deadlocked?
          dowait
        end
      end
      # marked under the monitor, so nobody leaves a job for this worker any more
      Thread.current[:finished] = true if job.nil?
    end
    job
  end

  # Runs +job+, updates the statistics and puts the job back into the queue
  # if it failed and +retryJob+ agrees.
  def processJob(job, hostname, worker_name, working_directory)
    @jobsRunning.synchronize { @jobsRunning << job }
    @hostStats.synchronize { @hostStats[worker_name].begin_update }
    results = job.run(hostname, worker_name, working_directory)
    @jobsRunning.synchronize { @jobsRunning.delete(job) }
    @hostStats.synchronize { @hostStats[worker_name].end_update(results) }
    if results
      job.dependencies = nil # free them for the garbage collection
      requeue = false
    else
      requeue = retryJob(job)
    end
    @jobQueue.synchronize do
      if requeue
        job.status = Job::WAITING
        @jobQueue << job
      end
      # wake up waiting workers: a dependency could be satisfied now or the
      # job is back in the queue
      wakeWorkers
    end
  end

  # Tells all waiting workers to look at the queue again. Their "found no
  # job" marks are outdated then and therefore reset.
  # The caller must hold the monitor of the job queue.
  def wakeWorkers
    @workers.each { |worker| worker[:getNoJob] = false if worker.alive? }
    @noJobs_cond.broadcast
  end

  # true if the worker thread can still take jobs
  def workerActive?(worker)
    worker.alive? and !worker[:finished]
  end

  # Names of the hosts that have at least one active worker
  def activeHosts
    hosts = @workers.select { |worker| workerActive?(worker) }
    hosts.map { |worker| worker[:hostname] }.compact.uniq
  end

  # The queue is not empty and no active worker gets a job? This means: deadlock!
  def deadlocked?
    !@jobQueue.empty? and
      @workers.all? { |worker| !workerActive?(worker) or worker[:getNoJob] }
  end

  # Writes a description of the deadlock to $stderr, releases the other
  # workers and raises Deadlock.
  # The caller holds the monitor of the job queue, so the queue cannot
  # change while the report is written.
  def reportDeadlock
    @deadlocked = true
    @noJobs_cond.broadcast
    # the deadlock can be caused by unsatisfiable or circular dependencies
    $stderr.puts "\nDeadlock! -- Current Job Queue (#{@jobQueue.length} jobs):\n========================================="
    $stderr.puts @jobQueue
    $stderr.puts "\nJobs with unsatisfied dependencies:"
    @jobQueue.each do |queued|
      $stderr.puts "#{queued.name}:"
      unsatisfied = queued.dependencies.select { |dep| dep.status != Job::SUCCESS }.join("\n ")
      if unsatisfied == ""
        $stderr.puts " no unsatisfied jobs"
      else
        $stderr.puts " " + unsatisfied.to_s
      end
    end
    $stderr.puts
    $stderr.puts "Before debugging the jobserver please verify, if"
    $stderr.puts " a) no job on which a job form the job queue depends on failed"
    $stderr.puts " b) the jobs form the job queue cannot be run on any host (if you redefine Job.runsOnHost)"
    raise Deadlock, "JobServer deadlock: no worker can get a job but there are jobs in the queue."
  end

  # Writes the host statistics, the running jobs and the queued jobs to +filename+.
  def writeStatistics(filename)
    File.open(filename, "w") do |file|
      file.puts "Host statistics:\n================"
      @hostStats.synchronize do
        @hostStats.sort_by { |host, _stats| host }.each { |host, stats| file.puts "#{host}: #{stats}" }
      end
      @jobsRunning.synchronize do
        unless @jobsRunning.empty?
          file.puts "\nJobs running:\n============"
          file.puts @jobsRunning.map { |job| "#{job.host}: #{job}" }, ""
        end
      end
      @jobQueue.synchronize do
        unless @jobQueue.empty?
          file.puts(s = "Jobs in the queue: (#{@jobQueue.length}/#@initialQueueLength remaining)")
          file.puts "=" * s.length
          file.puts @jobQueue
        end
      end
    end
  end

  # Stops the periodic statistics writer (if any) and writes the final state
  # synchronously, so that it is on disk when serve returns.
  def finishStatisticsDump
    return unless @dumpStatThread
    @dumpStatThread.kill
    @dumpStatThread.join
    @dumpStatThread = nil
    writeStatistics(@dumpStatFile)
  end
end