jobserver 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE.txt +58 -0
- data/README +58 -0
- data/examples/example1.rb +19 -0
- data/examples/example2.rb +66 -0
- data/examples/example3.rb +328 -0
- data/examples/jobserver.rb +619 -0
- data/lib/jobserver.rb +619 -0
- metadata +42 -0
data/LICENSE.txt
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
Ruby is copyrighted free software by Yukihiro Matsumoto <matz@netlab.co.jp>.
|
2
|
+
You can redistribute it and/or modify it under either the terms of the GPL
|
3
|
+
(see COPYING.txt file), or the conditions below:
|
4
|
+
|
5
|
+
1. You may make and give away verbatim copies of the source form of the
|
6
|
+
software without restriction, provided that you duplicate all of the
|
7
|
+
original copyright notices and associated disclaimers.
|
8
|
+
|
9
|
+
2. You may modify your copy of the software in any way, provided that
|
10
|
+
you do at least ONE of the following:
|
11
|
+
|
12
|
+
a) place your modifications in the Public Domain or otherwise
|
13
|
+
make them Freely Available, such as by posting said
|
14
|
+
modifications to Usenet or an equivalent medium, or by allowing
|
15
|
+
the author to include your modifications in the software.
|
16
|
+
|
17
|
+
b) use the modified software only within your corporation or
|
18
|
+
organization.
|
19
|
+
|
20
|
+
c) rename any non-standard executables so the names do not conflict
|
21
|
+
with standard executables, which must also be provided.
|
22
|
+
|
23
|
+
d) make other distribution arrangements with the author.
|
24
|
+
|
25
|
+
3. You may distribute the software in object code or executable
|
26
|
+
form, provided that you do at least ONE of the following:
|
27
|
+
|
28
|
+
a) distribute the executables and library files of the software,
|
29
|
+
together with instructions (in the manual page or equivalent)
|
30
|
+
on where to get the original distribution.
|
31
|
+
|
32
|
+
b) accompany the distribution with the machine-readable source of
|
33
|
+
the software.
|
34
|
+
|
35
|
+
c) give non-standard executables non-standard names, with
|
36
|
+
instructions on where to get the original software distribution.
|
37
|
+
|
38
|
+
d) make other distribution arrangements with the author.
|
39
|
+
|
40
|
+
4. You may modify and include the part of the software into any other
|
41
|
+
software (possibly commercial). But some files in the distribution
|
42
|
+
are not written by the author, so that they are not under this terms.
|
43
|
+
|
44
|
+
They are gc.c(partly), utils.c(partly), regex.[ch], st.[ch] and some
|
45
|
+
files under the ./missing directory. See each file for the copying
|
46
|
+
condition.
|
47
|
+
|
48
|
+
5. The scripts and library files supplied as input to or produced as
|
49
|
+
output from the software do not automatically fall under the
|
50
|
+
copyright of the software, but belong to whomever generated them,
|
51
|
+
and may be sold commercially, and may be aggregated with this
|
52
|
+
software.
|
53
|
+
|
54
|
+
6. THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR
|
55
|
+
IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
|
56
|
+
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
57
|
+
PURPOSE.
|
58
|
+
|
data/README
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
JobServer README
|
2
|
+
================
|
3
|
+
|
4
|
+
The class JobServer supplies capabilities to execute jobs on the
|
5
|
+
local host or on remote hosts.
|
6
|
+
* Jobs encapsulate the call of a command in a shell on a (remote) host.
|
7
|
+
* Each client is controlled by a worker-thread on the server.
|
8
|
+
When a client is idle it receives the next job from the queue.
|
9
|
+
* Remote jobs will be launched using ssh. Therefore you must
|
10
|
+
configure your ssh keys for authentication without a password.
|
11
|
+
* A common home directory is not needed for the clients.
|
12
|
+
Different machine architectures as well as binaries are possible.
|
13
|
+
* Data for the clients can be saved to files on the client machines.
|
14
|
+
|
15
|
+
|
16
|
+
Requirements
|
17
|
+
------------
|
18
|
+
|
19
|
+
* Ruby 1.8
|
20
|
+
* ssh
|
21
|
+
|
22
|
+
Install
|
23
|
+
-------
|
24
|
+
|
25
|
+
De-compress archive and enter its top directory.
|
26
|
+
Then type:
|
27
|
+
|
28
|
+
($ su)
|
29
|
+
# ruby setup.rb
|
30
|
+
|
31
|
+
This simple step installs this program under the default
|
32
|
+
location of Ruby libraries. You can also install files into
|
33
|
+
your favorite directory by supplying setup.rb some options.
|
34
|
+
Try "ruby setup.rb --help".
|
35
|
+
|
36
|
+
|
37
|
+
Alternatively you can use the remote installer RubyGems
|
38
|
+
[http://rubygems.rubyforge.org/] for installation. Having RubyGems installed
|
39
|
+
on your system, just type:
|
40
|
+
|
41
|
+
($ su)
|
42
|
+
# gem install jobserver --remote
|
43
|
+
|
44
|
+
|
45
|
+
Usage
|
46
|
+
-----
|
47
|
+
|
48
|
+
In order to get an overview of the features you can generate
|
49
|
+
the RDoc documentation and have a look at the examples/ directory.
|
50
|
+
|
51
|
+
|
52
|
+
License
|
53
|
+
-------
|
54
|
+
|
55
|
+
Ruby License
|
56
|
+
|
57
|
+
|
58
|
+
Christian Bang, cbang AT web.de
|
@@ -0,0 +1,19 @@
|
|
1
|
+
require 'jobserver'
|
2
|
+
|
3
|
+
# Create the jobs
|
4
|
+
myJobQueue = []
|
5
|
+
5.times{|i| myJobQueue << Job.new(:name=>"date job-#{i}", :client_command=> "date")}
|
6
|
+
5.times{|i| myJobQueue << Job.new(:name=>"short date job-#{i}", :client_command=> "date +%Y-%m-%d_%H.%M")}
|
7
|
+
# Create the server
|
8
|
+
server = JobServer.new(myJobQueue) #run one local worker implicitly
|
9
|
+
|
10
|
+
# You may add some remote machines to which you have ssh access with public key authentication without password.
|
11
|
+
# server.add_ssh_worker("192.168.0.1")
|
12
|
+
# server.add_ssh_worker("foo@mymachine.xy.org","",2)
|
13
|
+
|
14
|
+
# Dump out statistics on the progress.
|
15
|
+
# Default is to file "jobserver_stats.txt", every minute
|
16
|
+
server.dumpStatistics
|
17
|
+
|
18
|
+
# Wait until all jobs have finished
|
19
|
+
server.serve
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require 'jobserver'
|
2
|
+
|
3
|
+
# define the handlers for each job:
|
4
|
+
|
5
|
+
pre_run_handler = Proc.new do |job|
|
6
|
+
puts "Running job #{job.name} on #{job.host}"
|
7
|
+
# Initialize the results object as an empty array:
|
8
|
+
job.results = []
|
9
|
+
end
|
10
|
+
|
11
|
+
output_handler = Proc.new do |file, job|
|
12
|
+
line = file.gets
|
13
|
+
|
14
|
+
# Output the calculation, if the line contains one.
|
15
|
+
puts "Job #{job.name} has made the calculation: #{line}" if line =~ /=/
|
16
|
+
|
17
|
+
# Extract the date if the line contains one. Collect the results in the
|
18
|
+
# job.results variable.
|
19
|
+
job.results << $& if line =~ /\d?\d:\d\d:\d\d/
|
20
|
+
end
|
21
|
+
|
22
|
+
# Just another way of defining a handler. For this you must use method(:post_run_handler) later...
|
23
|
+
# Post-run handler defined as a plain method (pass it as method(:post_run_handler)).
#
# job:: the finished job; its +name+, +host+ and +results+ are read.
#
# If the job collected no results an error message is printed. Otherwise the
# job's results are appended to the per-host list in the global hash $result,
# which must have been initialized before the server starts serving.
def post_run_handler(job)
  if job.results.empty?
    puts "Error executing job #{job.name} on #{job.host}.\n\t#{job}"
  else
    # Now that the job has finished, store its results in the global list of
    # results. The list for a host is created on first use AND the results are
    # appended to it, so the first job on every host is recorded too.
    ($result[job.host] ||= []) << job.results
  end
end
|
37
|
+
|
38
|
+
# execute the following commands for every job:
|
39
|
+
Job.default_client_command = "sleep 10;date;echo"
|
40
|
+
Job.nicelevel = nil # in this example we don't need nice
|
41
|
+
Job.verbose = 0 # print no messages about the launch of jobs
|
42
|
+
|
43
|
+
# Create the jobs
|
44
|
+
myJobQueue = []
|
45
|
+
10.times{|i| myJobQueue << Job.new(:name=>"job-#{i}", :params=>"5*#{i}=$((5*#{i}))", :pre_run_handler => pre_run_handler,
|
46
|
+
:output_handler=>output_handler, :post_run_handler=>method(:post_run_handler))}
|
47
|
+
# Create the server
|
48
|
+
server = JobServer.new(myJobQueue) #run one local worker implicitly
|
49
|
+
|
50
|
+
$result = {} # we will store results here
|
51
|
+
|
52
|
+
# You may add some remote machines to which you have ssh access with public key authentication without password.
|
53
|
+
# server.add_ssh_worker("192.168.0.1")
|
54
|
+
# server.add_ssh_worker("foo@mymachine.xy.org","",2)
|
55
|
+
|
56
|
+
# Dump out statistics on the progress.
|
57
|
+
# Default is to file "jobserver_stats.txt", every minute
|
58
|
+
server.dumpStatistics
|
59
|
+
|
60
|
+
# Wait until all jobs have finished
|
61
|
+
server.serve
|
62
|
+
|
63
|
+
puts "Times on the hosts when the jobs where run:"
|
64
|
+
for host,time in $result
|
65
|
+
puts "#{host}: #{time.join(', ')}"
|
66
|
+
end
|
@@ -0,0 +1,328 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
=begin
|
3
|
+
This non-executable example is an excerpt from a real application of the jobserver.
|
4
|
+
It was used to run experiments for combinatorial optimization problems.
|
5
|
+
This example derives a new class from Job which enhances the abilities of the jobserver by:
|
6
|
+
- robust detection of failed jobs
|
7
|
+
- jobs with various resource-requirements will be assigned only to hosts that meet
|
8
|
+
their requirements
|
9
|
+
|
10
|
+
*Note*: This example assumes, that all hosts share a common home directory.
|
11
|
+
If you would like to have the original file on which this example is based, write to me:
|
12
|
+
Christian Bang, cbang@web.de
|
13
|
+
=end
|
14
|
+
|
15
|
+
############################# BEGIN PARAMETER SETTINGS ###################################
|
16
|
+
|
17
|
+
experiment = $*[0].to_i
|
18
|
+
# In this section you can add your own experiments. This way you can easily repeat an old experiment without having to
|
19
|
+
# change the parameters of the last experiment. The number of the experiment to run is given by command line.
|
20
|
+
case experiment # select the experiment you want to make here, or add a new one below
|
21
|
+
when 0
|
22
|
+
$stderr.puts "Please give the number of the experiment you want to run as a command line argument."
|
23
|
+
exit
|
24
|
+
when 1 ##### Experiment 1
|
25
|
+
# set variables which define the experiment
|
26
|
+
$param1_list = %w{1 3 5 7}
|
27
|
+
$param2_list = %w{xx xy yy}
|
28
|
+
$instances = %w{test01 test02}
|
29
|
+
when 2 ##### Experiment 2
|
30
|
+
# set variables which define the experiment
|
31
|
+
when 3 ##### Experiment 3
|
32
|
+
# set variables which define the experiment
|
33
|
+
else
|
34
|
+
raise "Undefined experiment"
|
35
|
+
end
|
36
|
+
puts "Running experiment ##{experiment}"
|
37
|
+
|
38
|
+
|
39
|
+
########################### now the environment dependent variables ####################
|
40
|
+
$projectDir = "#{ENV['HOME']}/project"
|
41
|
+
$experimentsDir = "#$projectDir/experiments" # where to store the experiments (in subdirectories)
|
42
|
+
$experimentName = "MY_PROJECT" # prefix for the job output filenames (if logging enabled), NOT the name of the binary
|
43
|
+
$workDir = "#$projectDir/src" # working directory for the processes
|
44
|
+
|
45
|
+
# useHosts is an array of the hosts you want to use for the experiment. If a host has multiple CPUs, all are used.
|
46
|
+
# For the definition of the hosts see the $knownHosts matrix below.
|
47
|
+
useHosts = %w{hannibal caesar homer asterix}
|
48
|
+
|
49
|
+
#JOB_LOGGING determines whether the output of each job should be kept in a log file (in $logDir)
|
50
|
+
JOB_LOGGING = false
|
51
|
+
|
52
|
+
# time when the experiment began. All log filenames contain this
|
53
|
+
$timestamp = `date +%Y-%m-%d_%H.%M`.chomp
|
54
|
+
|
55
|
+
$logDir = "#$experimentsDir/log.#{$experimentName}_#{$timestamp}"
|
56
|
+
|
57
|
+
############################# END PARAMETER SETTINGS ###################################
|
58
|
+
# usually you won't have to change anything below here
|
59
|
+
$LOAD_PATH << File.dirname($0)
|
60
|
+
require 'jobserver'
|
61
|
+
require 'fileutils'
|
62
|
+
|
63
|
+
# JobData has the following fields:
|
64
|
+
# :host_requirements:
|
65
|
+
# An OR concatenated list of REQUIRE_ variables like <tt>REQUIRE_RUBY|REQUIRE_GNUPLOT</tt>.
|
66
|
+
# A job is only run on a host fulfilling these constraints.
|
67
|
+
# :inputFiles:
|
68
|
+
# A string or an array of strings containing names of files that are needed to run
|
69
|
+
# the job. If they don't exist, the job fails. But in case a gzipped input file is found,
|
70
|
+
# it is unzipped first. (gzipped again later, if another job does it)
|
71
|
+
# :outputFiles:
|
72
|
+
# A string or an array of strings containing names of files that will be created when the
|
73
|
+
# job is run. If all the files already exist and are not corrupt, the job is not run but
|
74
|
+
# marked as success. See next item for what corrupt means.
|
75
|
+
# :successToken:
|
76
|
+
# A string or an array of strings corresponding to an entry in +outputFiles+.
|
77
|
+
# If a successToken for the output file is given and the file exists then a grep
|
78
|
+
# search for this token is made in the file. If the token was found, the file is deemed okay,
|
79
|
+
# else it is corrupt and will be deleted before execution of the job.
|
80
|
+
# :alwaysOverride: if true, existing output files will be overridden.
|
81
|
+
# :moreData: is some job-type specific data you can use for your own jobs. (E.g. used for latex generation)
|
82
|
+
JobData = Struct.new(:host_requirements, :inputFiles, :outputFiles, :successToken, :alwaysOverride, :moreData)
|
83
|
+
|
84
|
+
#add new software requirements here and in the list of $knownHosts below
|
85
|
+
REQUIRE_NONE = 0; REQUIRE_RUBY = 1; REQUIRE_GNUPLOT = 2; REQUIRE_MY_PROJECT = 4; REQUIRE_R = 8; REQUIRE_LATEX = 16
|
86
|
+
|
87
|
+
# These tokens should identify a given file that is NOT corrupt. That means the file could be generated
|
88
|
+
# completely. Add your own tokens here.
|
89
|
+
EPS_FILE_SUCCESS_TOKEN = "%%Trailer"
|
90
|
+
LATEX_SUCCESS_TOKEN = "end{document}"
|
91
|
+
OUTPUT_FORMAT_SUCCESS_TOKEN = "end data"
|
92
|
+
|
93
|
+
# A list of known (and allowed) hosts in your local network.
|
94
|
+
$knownHosts = [
|
95
|
+
#hostname, numCPUs, available software
|
96
|
+
["merlin", 1, REQUIRE_RUBY|REQUIRE_GNUPLOT|REQUIRE_MY_PROJECT|REQUIRE_R|REQUIRE_LATEX],
|
97
|
+
["asterix", 2, REQUIRE_RUBY|REQUIRE_GNUPLOT|REQUIRE_MY_PROJECT|REQUIRE_R|REQUIRE_LATEX],
|
98
|
+
["hannibal", 2, REQUIRE_RUBY|REQUIRE_GNUPLOT|REQUIRE_MY_PROJECT|REQUIRE_R ],
|
99
|
+
["caesar", 2, REQUIRE_RUBY|REQUIRE_GNUPLOT|REQUIRE_MY_PROJECT |REQUIRE_LATEX],
|
100
|
+
["nero", 1, REQUIRE_RUBY|REQUIRE_GNUPLOT|REQUIRE_MY_PROJECT|REQUIRE_R|REQUIRE_LATEX],
|
101
|
+
["herodot", 2, REQUIRE_RUBY|REQUIRE_GNUPLOT|REQUIRE_MY_PROJECT |REQUIRE_LATEX],
|
102
|
+
["cicero", 1, REQUIRE_MY_PROJECT|REQUIRE_R|REQUIRE_LATEX],
|
103
|
+
["brutus", 1, REQUIRE_GNUPLOT|REQUIRE_MY_PROJECT|REQUIRE_R|REQUIRE_LATEX],
|
104
|
+
["homer", 2, REQUIRE_GNUPLOT| REQUIRE_R|REQUIRE_LATEX],
|
105
|
+
["platon", 1, REQUIRE_GNUPLOT|REQUIRE_MY_PROJECT|REQUIRE_R|REQUIRE_LATEX]
|
106
|
+
]
|
107
|
+
|
108
|
+
# If you want to do time sensitive analysis and hence want to use only machines of the same type
|
109
|
+
# for an experiment you could insert a new constraint, e.g. REQUIRE_TIME, that is only satisfied on
|
110
|
+
# those machines you want to run the experiment on. An alternative approach is to create an object-specific class
|
111
|
+
# for those jobs that are time sensitive:
|
112
|
+
# timeSensitiveJob = MyProjectJob.new(...)
|
113
|
+
# def timeSensitiveJob.runsOnHost(hostname)
|
114
|
+
# return hostname == "myspecialhost" && super(hostname)
|
115
|
+
# end
|
116
|
+
|
117
|
+
# hostname variable, used below. Didn't use ENV['HOSTNAME'] since this didn't always work.
|
118
|
+
$HOSTNAME = `echo $HOSTNAME`.chomp
|
119
|
+
|
120
|
+
# This subclass of Job doesn't run a job if the output files are already there and intact.
|
121
|
+
# Corrupt/incomplete files will be removed and the job will be repeated (up to 3 times).
|
122
|
+
# It also writes the job output to a log file for each job in +$workDir+.
|
123
|
+
# Furthermore the data field is expected to be of type +JobData+.
|
124
|
+
# The restriction is that jobs can only be run in the same local network where the home directory
|
125
|
+
# is common on all hosts.
|
126
|
+
class MyProjectJob < Job
  # project-run could be a shell script that decides which binary to call depending on the
  # machine it has been called on.
  @@default_client_command = "#{$projectDir}/bin/project-run"

  # name::           label of the job; it gets prefixed with the zero-padded job number
  # params::         command line parameters; a redirection to the job log file is appended
  # data::           expected to be a +JobData+ (requirements, input/output files, tokens)
  # dependencies::   passed through to Job unchanged
  # client_command:: passed through to Job unchanged
  def initialize(name, params, data, dependencies = nil, client_command = nil)
    super(:data => data,
          :dependencies => dependencies,
          :pre_run_handler => method(:pre_run_handler),
          :post_run_handler => method(:post_run_handler),
          :client_command => client_command)
    # @number is not assigned in this class; presumably Job#initialize sets it -- TODO confirm.
    @outputLogFileName = JOB_LOGGING ? "%s/job-%05d.log" % [$logDir,@number] : "/dev/null"
    @name = "%05d-" % @number + name
    # NOTE(review): with "2>&1 >>file" stderr goes to the original stdout and only stdout is
    # appended to the log file. Left as is because the server may rely on reading stderr;
    # use ">>file 2>&1" instead if both streams are meant to end up in the log.
    @params = params + " 2>&1 >>#{@outputLogFileName}"
  end

  # Announces the start of the job on stdout and, if JOB_LOGGING is set, in the job log.
  def pre_run_handler(job)
    info = "date #{`date`.chomp}: Running job \"#{job.name}\" on #{job.host}"
    puts info
    if JOB_LOGGING
      system("echo #{quote(info)}>>#@outputLogFileName")
      system("echo \"CALL: #{job.client_command} #{job.params}\">>#@outputLogFileName")
    end
  end

  # Reports on stdout (and in the job log if JOB_LOGGING is set) whether the job finished
  # or failed. A job counts as failed when its results are nil/false.
  def post_run_handler(job)
    info = "date #{`date`.chomp}: "
    info += ((job.results) ? "finished" : "FAILED") +" Job \"#{job.name}\" on #{job.host}"
    puts info
    system("echo #{quote(info)}>>#@outputLogFileName") if JOB_LOGGING
  end

  # Runs the command unless all output files already exist and are intact.
  #
  # 1. Unless data.alwaysOverride is set, every output file is checked. Corrupt files are
  #    deleted. If all files exist (plain or gzipped) and there is at least one output
  #    file, the method returns without running anything.
  # 2. Every input file must exist; a gzipped version is unzipped first. A missing input
  #    file marks the job as failed (@results = nil) and aborts.
  # 3. The command is run via super, then the output files are checked again; a missing
  #    or corrupt output file marks the job as failed.
  def runCommand(command) # override Job.runCommand
    data.successToken = listOf(data.successToken)
    data.inputFiles   = listOf(data.inputFiles)
    data.outputFiles  = listOf(data.outputFiles)

    ## Check, if all output files already exist and are not corrupt. Abort (with success), if true
    unless data.alwaysOverride
      allExists = true
      tokens = data.successToken.clone
      for outputFile in data.outputFiles
        token = tokens.shift
        if File.size?(outputFile)
          unless containsToken?(outputFile, token)
            File.delete(outputFile)
            puts "Overwriting corrupt output file: #{outputFile}"
            allExists = false
          end
        elsif !File.size?(outputFile+".gz")
          # Neither the file nor a zipped version exists. A zipped file is assumed to be
          # intact since the check below is made after each run, before files are zipped.
          allExists = false
        end
      end
      # if no output files are written, run anyway
      return if allExists and not data.outputFiles.empty?
    end

    for inputFile in data.inputFiles
      next if File.size?(inputFile)
      # this should not happen since the dependencies should take care of this
      if File.size?(inputFile+".gz")
        puts "Unzipping #{inputFile}.gz ..."
        # Uncompress zipped input data if available. The argument list form bypasses the
        # shell, so file names with quotes or spaces are safe.
        # We use the server for this which is kind of dirty but this case should not happen
        # too often; the file will be recompressed by the "Compressing data"-job.
        system("gunzip", "-f", "#{inputFile}.gz")
      else
        puts "ERROR: input file '#{inputFile}' not found for command #{command}. Job failed!"
        @results = nil
        return
      end
    end

    data.outputFiles.each{|f| FileUtils.mkdir_p(File.dirname(f)) }
    super(command) # run the command

    tokens = data.successToken.clone
    for outputFile in data.outputFiles
      token = tokens.shift
      if !File.size?(outputFile)
        @results = nil # mark failure because file not found
      elsif !containsToken?(outputFile, token)
        File.delete(outputFile)
        info = "Deleting corrupt file: #{outputFile}"
        puts info
        system("echo #{quote(info)}>>#@outputLogFileName") if JOB_LOGGING
        @results = nil # mark failure because file corrupt
      end
    end
  end

  # Tests, whether the host fulfills the requirements of the job.
  # "localhost" is looked up under the name in $HOSTNAME. A host that is not listed in
  # $knownHosts never fulfills the requirements.
  def runsOnHost(hostname)
    hostname = $HOSTNAME if hostname == "localhost"
    hostInfo = $knownHosts.assoc(hostname)
    return false unless hostInfo
    return (data.host_requirements & hostInfo[2]) == data.host_requirements
  end

  private

  # Wraps a single value in an array; nil becomes an empty array, arrays are returned as is.
  def listOf(value)
    return [] if value.nil?
    value.is_a?(Array) ? value : [value]
  end

  # Returns true if +file+ contains a line matching the grep pattern +token+.
  # Without a token the existing file is assumed to be okay.
  def containsToken?(file, token)
    return true unless token
    # Argument list form: no shell is involved, so neither token nor file name need quoting.
    system("grep", "-q", "--", token, file) ? true : false
  end
end
|
234
|
+
|
235
|
+
# override JobServer.retryJob:
|
236
|
+
class JobServer
  # Decides whether a failed job is queued again.
  # Returns true (after printing a notice) while the job has been tried fewer
  # than 3 times, nil otherwise.
  def retryJob(job)
    return nil unless job.numTries < 3
    puts "FAILURE: Will try to run job later: #{job}"
    true
  end
end
|
244
|
+
|
245
|
+
|
246
|
+
# Return a list of MyProjectJob instances that can be passed to the JobServer.
|
247
|
+
# If you want to change the design of the experiment then change it here.
|
248
|
+
# Return a list of MyProjectJob instances that can be passed to the JobServer.
#
# For every combination of $param1_list, $param2_list and $instances four jobs
# are generated: experiment A, experiment B (which reads A's output), a plot job
# reading both outputs, and a job that gzips the two data files afterwards.
def generateJobs
  print "Generating jobs ... "
  jobs = []

  for param1 in $param1_list
    for param2 in $param2_list
      for instance in $instances
        workdir = "#{$experimentsDir}/#{instance}/param1=#{param1},param2=#{param2}"

        ############################# Generate Experiments #################################

        outputA = "#{workdir}/my-experiment-type-A.dat"
        params = "--input Instances/#{instance} --param1 #{param1} --param2 #{param2} --output #{outputA}"
        jobdata = JobData.new(REQUIRE_MY_PROJECT, [], outputA, OUTPUT_FORMAT_SUCCESS_TOKEN)
        jobA = MyProjectJob.new("type A: #{instance}; #{param1}, #{param2}", params, jobdata, nil)
        jobs << jobA

        # this experiment depends on the previous one of type A, so its input is
        # the output file of job A (interpolated, not the literal text "{outputA}")
        outputB = "#{workdir}/my-experiment-type-B.dat"
        params = "--input #{outputA} --param1 #{param1} --param2 #{param2} --output #{outputB}"
        jobdata = JobData.new(REQUIRE_MY_PROJECT, outputA, outputB, OUTPUT_FORMAT_SUCCESS_TOKEN)
        jobB = MyProjectJob.new("type B: #{param1}, #{param2}", params, jobdata, [jobA])
        jobs << jobB

        ################################# Create plots ##################################
        # Create multiplot (cost at iteration)
        inputs = [outputA, outputB]
        output = "#{workdir}/multiple.eps"
        jobdata = JobData.new(REQUIRE_RUBY|REQUIRE_GNUPLOT, inputs, output, EPS_FILE_SUCCESS_TOKEN)
        plotJob = MyProjectJob.new("Multiplot: <Title>",
                                   "<input params> -o '#{output}'", jobdata,
                                   [jobA, jobB], "#$projectDir/bin/multiplot_project.rb")
        jobs << plotJob

        ################################# compress data ##################################
        filesToZip = [outputA, outputB]
        jobdata = JobData.new(REQUIRE_NONE, [], filesToZip.map{|f|f+".gz"}, nil)
        jobs << MyProjectJob.new("Compressing data", filesToZip.map{|f| "'"+f+"'"}.join(" "), jobdata,
                                 [jobA,jobB,plotJob], "gzip")
        # the files will be uncompressed when needed again (see MyProjectJob.runCommand)

        ############# Create Latex file with all the plots for this instance. ############
        # this section has been excluded for this example
        # See the latex package on www.rubyforge.org
      end
    end
  end
  puts "#{jobs.length} jobs generated."
  return jobs
end # generateJobs
|
299
|
+
|
300
|
+
|
301
|
+
##################################################
|
302
|
+
################ MAIN PROGRAM ####################
|
303
|
+
##################################################
|
304
|
+
FileUtils.mkdir_p($logDir)
|
305
|
+
|
306
|
+
##### Creating the job server ##############
|
307
|
+
Thread.abort_on_exception = true # terminates the program when an exception occurred in a Thread
|
308
|
+
server = JobServer.new(generateJobs, $workDir, 0)
|
309
|
+
server.dumpStatistics(statsFilename = "#$logDir/jobserver_stats.txt",30)
|
310
|
+
puts "The server has started at #{`date`}"
|
311
|
+
puts "Waiting for workers to finish the jobs..."
|
312
|
+
puts "Look in #{statsFilename} to see the current state of the server"
|
313
|
+
|
314
|
+
for host in useHosts
|
315
|
+
hostInfo = $knownHosts.assoc(host)
|
316
|
+
unless hostInfo
|
317
|
+
raise(ArgumentError, "ERROR: unregistered host: #{host}", caller)
|
318
|
+
end
|
319
|
+
host,numCPUs,features = hostInfo
|
320
|
+
if host == $HOSTNAME or host == "localhost"
|
321
|
+
server.add_local_worker(numCPUs)
|
322
|
+
else
|
323
|
+
server.add_ssh_worker(host, $workDir, numCPUs)
|
324
|
+
end
|
325
|
+
end
|
326
|
+
|
327
|
+
server.serve
|
328
|
+
puts "The server has finished at #{`date`}"
|