jobserver 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE.txt +58 -0
- data/README +58 -0
- data/examples/example1.rb +19 -0
- data/examples/example2.rb +66 -0
- data/examples/example3.rb +328 -0
- data/examples/jobserver.rb +619 -0
- data/lib/jobserver.rb +619 -0
- metadata +42 -0
data/LICENSE.txt
ADDED
@@ -0,0 +1,58 @@
Ruby is copyrighted free software by Yukihiro Matsumoto <matz@netlab.co.jp>.
You can redistribute it and/or modify it under either the terms of the GPL
(see COPYING.txt file), or the conditions below:

  1. You may make and give away verbatim copies of the source form of the
     software without restriction, provided that you duplicate all of the
     original copyright notices and associated disclaimers.

  2. You may modify your copy of the software in any way, provided that
     you do at least ONE of the following:

       a) place your modifications in the Public Domain or otherwise
          make them Freely Available, such as by posting said
          modifications to Usenet or an equivalent medium, or by allowing
          the author to include your modifications in the software.

       b) use the modified software only within your corporation or
          organization.

       c) rename any non-standard executables so the names do not conflict
          with standard executables, which must also be provided.

       d) make other distribution arrangements with the author.

  3. You may distribute the software in object code or executable
     form, provided that you do at least ONE of the following:

       a) distribute the executables and library files of the software,
          together with instructions (in the manual page or equivalent)
          on where to get the original distribution.

       b) accompany the distribution with the machine-readable source of
          the software.

       c) give non-standard executables non-standard names, with
          instructions on where to get the original software distribution.

       d) make other distribution arrangements with the author.

  4. You may modify and include the part of the software into any other
     software (possibly commercial). But some files in the distribution
     are not written by the author, so that they are not under this terms.

     They are gc.c(partly), utils.c(partly), regex.[ch], st.[ch] and some
     files under the ./missing directory. See each file for the copying
     condition.

  5. The scripts and library files supplied as input to or produced as
     output from the software do not automatically fall under the
     copyright of the software, but belong to whomever generated them,
     and may be sold commercially, and may be aggregated with this
     software.

  6. THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR
     IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
     WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     PURPOSE.
data/README
ADDED
@@ -0,0 +1,58 @@
JobServer README
================

The class JobServer supplies capabilities to execute jobs on the
local host or on remote hosts.
* Jobs encapsulate the call of a command in a shell on a (remote) host.
* Each client is controlled by a worker thread on the server.
  When a client is idle it receives the next job from the queue.
* Remote jobs will be launched using ssh. Therefore you must
  configure your ssh keys for authentication without a password.
* A common home directory is not needed for the clients.
  Different machine architectures as well as binaries are possible.
* Data for the clients can be saved to files on the client machines.
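
A minimal session, based on examples/example1.rb, looks like this (the
commented-out host name is a placeholder for a machine you can reach via ssh):

  require 'jobserver'

  myJobQueue = []
  5.times{|i| myJobQueue << Job.new(:name => "date job-#{i}", :client_command => "date")}

  server = JobServer.new(myJobQueue)        # runs one local worker implicitly
  # server.add_ssh_worker("some.host.org")  # optionally add remote workers
  server.dumpStatistics                     # progress goes to "jobserver_stats.txt"
  server.serve                              # wait until all jobs have finished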

Requirements
------------

* Ruby 1.8
* ssh

Install
-------

Decompress the archive and enter its top directory.
Then type:

  ($ su)
  # ruby setup.rb

This simple step installs the program under the default
location of Ruby libraries. You can also install the files into
a directory of your choice by passing options to setup.rb.
Try "ruby setup.rb --help".


Alternatively you can use the remote installer RubyGems
[http://rubygems.rubyforge.org/] for installation. With RubyGems installed
on your system, just type:

  ($ su)
  # gem install jobserver --remote


Usage
-----

To get an overview of the features, generate the RDoc documentation
and have a look at the examples/ directory.


License
-------

Ruby License


Christian Bang, cbang AT web.de
data/examples/example1.rb
ADDED
@@ -0,0 +1,19 @@
require 'jobserver'

# Create the jobs
myJobQueue = []
5.times{|i| myJobQueue << Job.new(:name=>"date job-#{i}", :client_command=> "date")}
5.times{|i| myJobQueue << Job.new(:name=>"short date job-#{i}", :client_command=> "date +%Y-%m-%d_%H.%M")}
# Create the server
server = JobServer.new(myJobQueue) # run one local worker implicitly

# You may add some remote machines to which you have ssh access with public key authentication without a password.
# server.add_ssh_worker("192.168.0.1")
# server.add_ssh_worker("foo@mymachine.xy.org","",2)

# Dump out statistics on the progress.
# Default is the file "jobserver_stats.txt", written every minute
server.dumpStatistics

# Wait until all jobs have finished
server.serve
data/examples/example2.rb
ADDED
@@ -0,0 +1,66 @@
require 'jobserver'

# define the handlers for each job:

pre_run_handler = Proc.new do |job|
  puts "Running job #{job.name} on #{job.host}"
  # Initialize the results object as an empty array:
  job.results = []
end

output_handler = Proc.new do |file, job|
  line = file.gets

  # Output the calculation, if the line contains one.
  puts "Job #{job.name} has made the calculation: #{line}" if line =~ /=/

  # Extract the time if the line contains one. Collect the results in the
  # job.results variable.
  job.results << $& if line =~ /\d?\d:\d\d:\d\d/
end

# Just another way of defining a handler. For this you must use method(:post_run_handler) later...
def post_run_handler(job)
  if job.results.empty?
    puts "Error executing job #{job.name} on #{job.host}.\n\t#{job}"
  else
    # Now that the job has finished, store the results of this job in the global
    # list of results:
    # You may store results e.g. in a global variable:
    if $result[job.host]
      $result[job.host] << job.results
    else
      $result[job.host] = [job.results]
    end
  end
end

# execute the following command for every job:
Job.default_client_command = "sleep 10;date;echo"
Job.nicelevel = nil # in this example we don't need nice
Job.verbose = 0     # print no messages about the launch of jobs

# Create the jobs
myJobQueue = []
10.times{|i| myJobQueue << Job.new(:name=>"job-#{i}", :params=>"5*#{i}=$((5*#{i}))", :pre_run_handler => pre_run_handler,
                                   :output_handler=>output_handler, :post_run_handler=>method(:post_run_handler))}
# Create the server
server = JobServer.new(myJobQueue) # run one local worker implicitly

$result = {} # we will store results here

# You may add some remote machines to which you have ssh access with public key authentication without a password.
# server.add_ssh_worker("192.168.0.1")
# server.add_ssh_worker("foo@mymachine.xy.org","",2)

# Dump out statistics on the progress.
# Default is the file "jobserver_stats.txt", written every minute
server.dumpStatistics

# Wait until all jobs have finished
server.serve

puts "Times on the hosts when the jobs were run:"
for host, time in $result
  puts "#{host}: #{time.join(', ')}"
end
data/examples/example3.rb
ADDED
@@ -0,0 +1,328 @@
#!/usr/bin/env ruby
=begin
This non-executable example is an excerpt from a real application of the jobserver.
It was used to run experiments for combinatorial optimization problems.
This example derives a new class from Job which enhances the abilities of the jobserver by:
- robust detection of failed jobs
- jobs with various resource requirements will be assigned only to hosts that meet
  their requirements

*Note*: This example assumes that all hosts share a common home directory.
If you would like to have the original file on which this example is based, write me:
Christian Bang, cbang@web.de
=end

############################# BEGIN PARAMETER SETTINGS ###################################

experiment = $*[0].to_i
# In this section you can add your own experiments. This way you can easily repeat an old experiment without having to
# change the parameters of the last experiment. The number of the experiment to run is given on the command line.
case experiment # select the experiment you want to make here, or add a new one below
when 0
  $stderr.puts "Please give the number of the experiment you want to run as a command line argument."
  exit
when 1 ##### Experiment 1
  # set variables which define the experiment
  $param1_list = %w{1 3 5 7}
  $param2_list = %w{xx xy yy}
  $instances   = %w{test01 test02}
when 2 ##### Experiment 2
  # set variables which define the experiment
when 3 ##### Experiment 3
  # set variables which define the experiment
else
  raise "Undefined experiment"
end
puts "Running experiment ##{experiment}"


########################### now the environment-dependent variables ####################
$projectDir     = "#{ENV['HOME']}/project"
$experimentsDir = "#$projectDir/experiments" # where to store the experiments (in subdirectories)
$experimentName = "MY_PROJECT" # prefix for the job output filenames (if logging enabled), NOT the name of the binary
$workDir        = "#$projectDir/src" # working directory for the processes

# useHosts is an array of the hosts you want to use for the experiment. If a host has multiple CPUs, all are used.
# For the definition of the hosts see the $knownHosts matrix below.
useHosts = %w{hannibal caesar homer asterix}

# JOB_LOGGING determines whether the output of each job should be kept in a log file (in $logDir)
JOB_LOGGING = false

# time when the experiment began. All log filenames contain this
$timestamp = `date +%Y-%m-%d_%H.%M`.chomp

$logDir = "#$experimentsDir/log.#{$experimentName}_#{$timestamp}"

############################# END PARAMETER SETTINGS ###################################
# usually you won't have to change anything below here
$LOAD_PATH << File.dirname($0)
require 'jobserver'
require 'fileutils'

# JobData has the following fields:
# :host_requirements:
#   An OR-concatenated list of REQUIRE_ variables like <tt>REQUIRE_RUBY|REQUIRE_GNUPLOT</tt>.
#   A job is only run on a host fulfilling these constraints.
# :inputFiles:
#   A string or an array of strings containing names of files that are needed to run
#   the job. If they don't exist, the job fails. But in case a gzipped input file is found,
#   it is unzipped first. (It is gzipped again later if another job does so.)
# :outputFiles:
#   A string or an array of strings containing names of files that will be created when the
#   job is run. If all the files already exist and are not corrupt, the job is not run but
#   marked as success. See next item for what corrupt means.
# :successToken:
#   A string or an array of strings corresponding to an entry in +outputFiles+.
#   If a successToken for the output file is given and the file exists then a grep
#   search for this token is made in the file. If the token was found, the file is deemed okay,
#   else it is corrupt and will be deleted before execution of the job.
# :alwaysOverride: if true, existing output files will be overridden.
# :moreData: is some job-type specific data you can use for your own jobs. (E.g. used for latex generation)
JobData = Struct.new(:host_requirements, :inputFiles, :outputFiles, :successToken, :alwaysOverride, :moreData)
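
# For illustration, a JobData for a hypothetical plotting step could look like
# this: it requires gnuplot on the host, reads "input.dat", writes "plot.eps",
# and uses the EPS trailer line (defined below) to detect a complete file:
#
#   JobData.new(REQUIRE_GNUPLOT, "input.dat", "plot.eps", EPS_FILE_SUCCESS_TOKEN)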

# add new software requirements here and in the list of $knownHosts below
REQUIRE_NONE = 0; REQUIRE_RUBY = 1; REQUIRE_GNUPLOT = 2; REQUIRE_MY_PROJECT = 4; REQUIRE_R = 8; REQUIRE_LATEX = 16

# These tokens should identify a given file that is NOT corrupt. That means the file could be generated
# completely. Add your own tokens here.
EPS_FILE_SUCCESS_TOKEN = "%%Trailer"
LATEX_SUCCESS_TOKEN = "end{document}"
OUTPUT_FORMAT_SUCCESS_TOKEN = "end data"

# A list of known (and allowed) hosts in your local network.
$knownHosts = [
  # hostname,  numCPUs, available software
  ["merlin",   1, REQUIRE_RUBY|REQUIRE_GNUPLOT|REQUIRE_MY_PROJECT|REQUIRE_R|REQUIRE_LATEX],
  ["asterix",  2, REQUIRE_RUBY|REQUIRE_GNUPLOT|REQUIRE_MY_PROJECT|REQUIRE_R|REQUIRE_LATEX],
  ["hannibal", 2, REQUIRE_RUBY|REQUIRE_GNUPLOT|REQUIRE_MY_PROJECT|REQUIRE_R              ],
  ["caesar",   2, REQUIRE_RUBY|REQUIRE_GNUPLOT|REQUIRE_MY_PROJECT          |REQUIRE_LATEX],
  ["nero",     1, REQUIRE_RUBY|REQUIRE_GNUPLOT|REQUIRE_MY_PROJECT|REQUIRE_R|REQUIRE_LATEX],
  ["herodot",  2, REQUIRE_RUBY|REQUIRE_GNUPLOT|REQUIRE_MY_PROJECT          |REQUIRE_LATEX],
  ["cicero",   1, REQUIRE_MY_PROJECT|REQUIRE_R|REQUIRE_LATEX],
  ["brutus",   1, REQUIRE_GNUPLOT|REQUIRE_MY_PROJECT|REQUIRE_R|REQUIRE_LATEX],
  ["homer",    2, REQUIRE_GNUPLOT|                   REQUIRE_R|REQUIRE_LATEX],
  ["platon",   1, REQUIRE_GNUPLOT|REQUIRE_MY_PROJECT|REQUIRE_R|REQUIRE_LATEX]
]

# If you want to do time-sensitive analysis and hence want to use only machines of the same type
# for an experiment you could insert a new constraint, e.g. REQUIRE_TIME, that is only satisfied on
# those machines you want to run the experiment on. An alternative approach is to create an object-specific class
# for those jobs that are time sensitive:
#   timeSensitiveJob = MyProjectJob.new(...)
#   def timeSensitiveJob.runsOnHost(hostname)
#     return hostname == "myspecialhost" && super(hostname)
#   end

# hostname variable, used below. Didn't use ENV['HOSTNAME'] since this didn't always work.
$HOSTNAME = `echo $HOSTNAME`.chomp

# This subclass of Job doesn't run a job if the output files are already there and intact.
# Corrupt/incomplete files will be removed and the job will be repeated (up to 3 times).
# It also writes the job output to a log file for each job in +$workDir+.
# Furthermore the data field is expected to be of type +JobData+.
# The restriction is that jobs can only be run in the same local network, where the home directory
# is common to all hosts.
class MyProjectJob < Job
  # project-run could be a shell script that decides which binary to call depending on the machine
  # on which it is called.
  @@default_client_command = "#{$projectDir}/bin/project-run"

  def initialize(name, params, data, dependencies = nil, client_command = nil)
    super(:data => data,
          :dependencies => dependencies,
          :pre_run_handler => method(:pre_run_handler),
          :post_run_handler => method(:post_run_handler),
          :client_command => client_command)
    @outputLogFileName = JOB_LOGGING ? "%s/job-%05d.log" % [$logDir, @number] : "/dev/null"
    @name = "%05d-" % @number + name
    @params = params + " 2>&1 >>#{@outputLogFileName}"
  end

  def pre_run_handler(job)
    info = "date #{`date`.chomp}: Running job \"#{job.name}\" on #{job.host}"
    puts info
    if JOB_LOGGING
      system("echo #{quote(info)}>>#@outputLogFileName")
      system("echo \"CALL: #{job.client_command} #{job.params}\">>#@outputLogFileName")
    end
  end

  def post_run_handler(job)
    info = "date #{`date`.chomp}: "
    info += ((job.results) ? "finished" : "FAILED") + " Job \"#{job.name}\" on #{job.host}"
    puts info
    system("echo #{quote(info)}>>#@outputLogFileName") if JOB_LOGGING
  end

  def runCommand(command) # override Job.runCommand
    data.successToken = [data.successToken] unless data.successToken.is_a?(Array)
    data.inputFiles   = [data.inputFiles]   unless data.inputFiles.is_a?(Array)
    data.outputFiles  = [data.outputFiles]  unless data.outputFiles.is_a?(Array)

    ## Check if all output files already exist and are not corrupt. Abort (with success) if so.
    allExists = true; tokens = data.successToken.clone
    if !data.alwaysOverride
      for outputFile in data.outputFiles
        token = tokens.shift
        if File.size?(outputFile)
          if data.successToken
            if `grep "#{token}" '#{outputFile}'` == '' # no success?
              File.delete(outputFile)
              puts "Overwriting corrupt output file: #{outputFile}"
              allExists = false
            end
            # no token? assume the existing file is okay
          end
        else # no file?
          if File.size?(outputFile+".gz") # but a zipped version?
            # assume that the zipped file is not corrupt since a corruption test is made
            # after each run and before the file is zipped
          else
            allExists = false
          end
        end
      end
    end
    return if !data.alwaysOverride and allExists and not data.outputFiles.empty? # if no output files are written, run anyway

    for inputFile in data.inputFiles
      if not File.size?(inputFile) # this should not happen since the dependencies should take care of this
        if File.size?(inputFile+".gz")
          puts "Unzipping #{inputFile}.gz ..."
          system("gunzip -f \"#{inputFile}.gz\"") # uncompress zipped input data if available
          # we use the server for this which is kind of dirty but this case should not happen too often
          # it will be recompressed by the "Compressing data" job
        else
          puts "ERROR: input file '#{inputFile}' not found for command #{command}. Job failed!"
          @results = nil
          return
        end
      end
    end

    data.outputFiles.each{|f| FileUtils.mkdir_p(File.dirname(f)) }
    super(command) # run the command

    tokens = data.successToken.clone
    for outputFile in data.outputFiles
      token = tokens.shift
      if File.size?(outputFile)
        if data.successToken
          if `grep "#{token}" '#{outputFile}'` == '' # no success?
            File.delete(outputFile)
            info = "Deleting corrupt file: #{outputFile}"
            puts info
            system("echo '#{info}'>>#@outputLogFileName")
            @results = nil # mark failure because file corrupt
          end
          # no token? assume the existing file is okay
        end
      else
        @results = nil # mark failure because file not found
      end
    end
  end

  # Tests whether the host fulfills the requirements of the job.
  def runsOnHost(hostname)
    hostname = $HOSTNAME if hostname == "localhost"
    hostInfo = $knownHosts.assoc(hostname)
    return (data.host_requirements & hostInfo[2]) == data.host_requirements
  end
end

# override JobServer.retryJob:
class JobServer
  def retryJob(job)
    if job.numTries < 3
      puts "FAILURE: Will try to run job later: #{job}"
      true
    end # else return false implicitly
  end
end


# Return a list of MyProjectJob instances that can be passed to the JobServer.
# If you want to change the design of the experiment then change it here.
def generateJobs
  print "Generating jobs ... "
  jobs = []

  for param1 in $param1_list
    for param2 in $param2_list
      for instance in $instances
        workdir = "#{$experimentsDir}/#{instance}/param1=#{param1},param2=#{param2}"

        ############################# Generate Experiments #################################

        outputA = "#{workdir}/my-experiment-type-A.dat"
        params  = "--input Instances/#{instance} --param1 #{param1} --param2 #{param2} --output #{outputA}"
        jobdata = JobData.new(REQUIRE_MY_PROJECT, [], outputA, OUTPUT_FORMAT_SUCCESS_TOKEN)
        jobA    = MyProjectJob.new("type A: #{instance}; #{param1}, #{param2}", params, jobdata, nil)
        jobs << jobA

        # this experiment depends on the previous one of type A
        outputB = "#{workdir}/my-experiment-type-B.dat"
        params  = "--input #{outputA} --param1 #{param1} --param2 #{param2} --output #{outputB}"
        jobdata = JobData.new(REQUIRE_MY_PROJECT, outputA, outputB, OUTPUT_FORMAT_SUCCESS_TOKEN)
        jobB    = MyProjectJob.new("type B: #{param1}, #{param2}", params, jobdata, [jobA])
        jobs << jobB


        ################################# Create plots ##################################
        # Create multiplot (cost at iteration)
        inputs = [outputA, outputB]
        output = "#{workdir}/multiple.eps"
        jobdata = JobData.new(REQUIRE_RUBY|REQUIRE_GNUPLOT, inputs, output, EPS_FILE_SUCCESS_TOKEN)
        jobs << plotJob = MyProjectJob.new("Multiplot: <Title>",
                                           "<input params> -o '#{output}'", jobdata,
                                           [jobA, jobB], "#$projectDir/bin/multiplot_project.rb")

        ################################# compress data ##################################
        filesToZip = [outputA, outputB]
        jobdata = JobData.new(REQUIRE_NONE, [], filesToZip.map{|f|f+".gz"}, nil)
        jobs << MyProjectJob.new("Compressing data", filesToZip.map{|f| "'"+f+"'"}.join(" "), jobdata,
                                 [jobA,jobB,plotJob], "gzip")
        # the files will be uncompressed when needed again (see MyProjectJob.runCommand)


        ############# Create Latex file with all the plots for this instance. ############
        # this section has been excluded for this example
        # See the latex package on www.rubyforge.org
      end
    end
  end
  puts "#{jobs.length} jobs generated."
  return jobs
end # generateJobs


##################################################
################ MAIN PROGRAM ####################
##################################################
FileUtils.mkdir_p($logDir)

##### Creating the job server ##############
Thread.abort_on_exception = true # terminates the program when an exception occurs in a thread
server = JobServer.new(generateJobs, $workDir, 0)
server.dumpStatistics(statsFilename = "#$logDir/jobserver_stats.txt", 30)
puts "The server has started at #{`date`}"
puts "Waiting for workers to finish the jobs..."
puts "Look in #{statsFilename} to see the current state of the server"

for host in useHosts
  hostInfo = $knownHosts.assoc(host)
  unless hostInfo
    raise(ArgumentError, "ERROR: unregistered host: #{host}", caller)
  end
  host, numCPUs, features = hostInfo
  if host == $HOSTNAME or host == "localhost"
    server.add_local_worker(numCPUs)
  else
    server.add_ssh_worker(host, $workDir, numCPUs)
  end
end

server.serve
puts "The server has finished at #{`date`}"