neptune 0.2.0 → 0.2.1

@@ -1,10 +1,18 @@
  # Programmer: Chris Bunch
 
+ # A special class of exceptions that are thrown whenever the AppController
+ # experiences an unexpected result.
  class AppControllerException < Exception
  end
 
+
+ # A class of exceptions that are thrown when the user tries to run a Neptune
+ # job but fails to give us the correct parameters to do so.
  class BadConfigurationException < Exception
  end
 
+
+ # An exception that is thrown whenever the user specifies a file to use
+ # that does not exist.
  class FileNotFoundException < Exception
  end
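
These three exception classes are what callers of the gem see when a request goes wrong: AppControllerException for unexpected AppController results, BadConfigurationException for bad job parameters, and FileNotFoundException for missing remote files. As a rough usage sketch (the job parameters below are hypothetical, not taken from this diff), a caller rescues the specific class, since these inherit from Exception rather than StandardError and a bare rescue would not catch them:

```ruby
# Hypothetical sketch: calling neptune() with incomplete parameters should
# surface a BadConfigurationException; the values here are placeholders.
require 'neptune'

begin
  neptune(:type => "mpi", :code => "/bucket/code")  # missing e.g. :output
rescue BadConfigurationException => e
  puts "Job rejected: #{e.message}"
end
```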
@@ -5,54 +5,81 @@ require 'app_controller_client'
  require 'common_functions'
  require 'custom_exceptions'
 
+
  # Setting verbose to nil here suppresses the otherwise
  # excessive SSL cert warning messages that will pollute
  # stderr and worry users unnecessarily.
  $VERBOSE = nil
 
- #MPI_RUN_JOB_REQUIRED = %w{ input output code filesystem }
- #MPI_REQUIRED = %w{ output }
- #X10_RUN_JOB_REQUIRED = %w{ input output code filesystem }
- #X10_REQUIRED = %w{ output }
- #DFSP_RUN_JOB_REQUIRED = %w{ output simulations }
- #DFSP_REQUIRED = %w{ output }
- #CEWSSA_RUN_JOB_REQUIRED = %w{ output simulations }
- #CEWSSA_REQUIRED = %w{ output }
- #MR_RUN_JOB_REQUIRED = %w{ }
- #MR_REQUIRED = %w{ output }
 
  # A list of all the Neptune job types that we support
  ALLOWED_JOB_TYPES = %w{acl cicero compile erlang mpi input output ssa babel upc x10}
 
+
  # The string to display for disallowed job types.
  JOB_TYPE_NOT_ALLOWED = "The job type you specified is not supported."
 
+
  # A list of Neptune jobs that do not require nodes to be spawned
  # up for computation
  NO_NODES_NEEDED = ["acl", "input", "output", "compile"]
 
+
  # A list of Neptune jobs that do not require the output to be
  # specified beforehand
  NO_OUTPUT_NEEDED = ["input"]
 
+
  # A list of storage mechanisms that we can use to store and retrieve
  # data to for Neptune jobs.
  ALLOWED_STORAGE_TYPES = ["appdb", "gstorage", "s3", "walrus"]
 
+
  # A list of jobs that require some kind of work to be done before
  # the actual computation can be performed.
  NEED_PREPROCESSING = ["babel", "compile", "erlang", "mpi", "ssa"]
 
- # A set of methods and constants that we've monkey-patched to enable Neptune
- # support. In the future, it is likely that the only exposed / monkey-patched
- # method should be job, while the others could probably be folded into either
- # a Neptune-specific class or into CommonFunctions.
- # TODO(cbunch): This doesn't look like it does anything - run the integration
- # test and confirm one way or the other.
+
+ # Since we're monkeypatching Object to add neptune() and babel(), a short
+ # blurb is necessary here to make rdoc happy.
  class Object
  end
 
+
+ # Make neptune() public so that babel() can call it
+ public
+
+
+ # This method is the heart of Neptune - here, we take blocks of code that the
+ # user has written and convert them into HPC job requests. At a high level,
+ # the user can request to run a job, retrieve a job's output, or modify the
+ # access policy (ACL) for the output of a job. By default, job data is private,
+ # but a Neptune job can be used to set it to public later (and vice-versa).
+ def neptune(params)
+ # Kernel.puts "Received a request to run a job."
+ # Kernel.puts params[:type]
+
+ job_data = NeptuneHelper.get_job_data(params)
+ NeptuneHelper.validate_storage_params(job_data)
+ # Kernel.puts "job data = #{job_data.inspect}"
+ keyname = job_data["@keyname"]
+
+ shadow_ip = CommonFunctions.get_from_yaml(keyname, :shadow)
+ secret = CommonFunctions.get_secret_key(keyname)
+ ssh_key = File.expand_path("~/.appscale/#{keyname}.key")
+ ssh_args = "-i ~/.appscale/#{keyname}.key -o StrictHostkeyChecking=no "
+
+ controller = AppControllerClient.new(shadow_ip, secret)
+ NeptuneHelper.do_preprocessing(job_data, controller)
+ return NeptuneHelper.run_job(job_data, ssh_args, shadow_ip, secret)
+ end
+
+
+ # NeptuneHelper provides methods that are used by neptune() and babel() to
+ # validate parameters and run the user's job.
  module NeptuneHelper
+
+
  # Certain types of jobs need steps to be taken before they
  # can be started (e.g., copying input data or code over).
  # This method dispatches the right method to use based
@@ -69,6 +96,7 @@ module NeptuneHelper
  send(preprocess, job_data, controller)
  end
 
+
  # This preprocessing method copies over the user's code to the
  # Shadow node so that it can be compiled there. A future version
  # of this method may also copy over libraries as well.
@@ -85,13 +113,16 @@ module NeptuneHelper
 
  ssh_args = "-i ~/.appscale/#{keyname}.key -o StrictHostkeyChecking=no root@#{shadow_ip}"
  remove_dir = "ssh #{ssh_args} 'rm -rf #{dest}' 2>&1"
- Kernel.puts remove_dir
+ # Kernel.puts remove_dir
  CommonFunctions.shell(remove_dir)
  CommonFunctions.scp_to_shadow(code, dest, keyname, is_dir=true)
 
  job_data["@code"] = dest
  end
 
+
+ # This preprocessing method makes sure that the user's Erlang code exists
+ # and copies it over to the AppScale Shadow node.
  def self.preprocess_erlang(job_data, controller)
  self.require_param("@code", job_data)
 
@@ -106,6 +137,7 @@ module NeptuneHelper
  CommonFunctions.scp_to_shadow(source_code, dest_code, keyname)
  end
 
+
  # This preprocessing method verifies that the user specified the number of nodes
  # to use. If they also specified the number of processes to use, we also verify
  # that this value is at least as many as the number of nodes (that is, nodes
@@ -138,6 +170,7 @@ module NeptuneHelper
  return job_data
  end
 
+
  # This preprocessing method verifies that the user specified the number of
  # trajectories to run, via either :trajectories or :simulations. Both should
  # not be specified - only one or the other, and regardless of which they
@@ -157,12 +190,18 @@ module NeptuneHelper
  return job_data
  end
 
+
+ # This helper method aborts if the given parameter is not present in the
+ # job data provided.
  def self.require_param(param, job_data)
  if !job_data[param]
  raise BadConfigurationException.new("#{param} must be specified")
  end
  end
 
+
+ # This helper method asks the AppController if the named file exists,
+ # and if it does not, throws an exception.
  def self.require_file_to_exist(file, job_data, controller)
  if controller.does_file_exist?(file, job_data)
  return
@@ -171,6 +210,9 @@ module NeptuneHelper
  end
  end
 
+
+ # This helper method performs the opposite function of require_file_to_exist,
+ # raising an exception if the named file does exist.
  def self.require_file_to_not_exist(file, job_data, controller)
  begin
  self.require_file_to_exist(file, job_data, controller)
@@ -181,6 +223,7 @@ module NeptuneHelper
  end
  end
 
+
  # This preprocessing method verifies that the user specified code that
  # should be run, where the output should be placed, and an engine to run over.
  # It also verifies that all files to be used are actually reachable.
@@ -229,6 +272,10 @@ module NeptuneHelper
  end
  end
 
+
+ # This method takes in a hash in the format that users write neptune/babel
+ # jobs in {:a => "b"} and converts it to the legacy format that Neptune
+ # used to use {"@a" => "b"}, and is understood by the AppController.
  def self.get_job_data(params)
  job_data = {}
  params.each { |k, v|
@@ -277,6 +324,11 @@ module NeptuneHelper
  return job_data
  end
 
+
+ # This method looks through the given job data and makes sure that the correct
+ # parameters are present for the storage mechanism specified. It throws an
+ # exception if there are errors in the job data or if a needed parameter is
+ # missing.
  def self.validate_storage_params(job_data)
  job_data["@storage"] ||= "appdb"
 
@@ -297,10 +349,10 @@ module NeptuneHelper
  if storage == "s3"
  ["EC2_ACCESS_KEY", "EC2_SECRET_KEY", "S3_URL"].each { |item|
  if job_data["@#{item}"]
- Kernel.puts "Using specified #{item}"
+ # Kernel.puts "Using specified #{item}"
  else
  if ENV[item]
- Kernel.puts "Using #{item} from environment"
+ # Kernel.puts "Using #{item} from environment"
  job_data["@#{item}"] = ENV[item]
  else
  raise BadConfigurationException.new("When storing data to S3, #{item} must be specified or be in " +
@@ -313,6 +365,7 @@ module NeptuneHelper
  return job_data
  end
 
+
  # This method takes a file on the local user's computer and stores it remotely
  # via AppScale. It returns a hash map indicating whether or not the job
  # succeeded and if it failed, the reason for it.
@@ -330,11 +383,11 @@ module NeptuneHelper
 
  remote = "/tmp/neptune-input-#{rand(100000)}"
  scp_cmd = "scp -r #{ssh_args} #{local_file} root@#{shadow_ip}:#{remote}"
- Kernel.puts scp_cmd
+ # Kernel.puts scp_cmd
  CommonFunctions.shell(scp_cmd)
 
  job_data["@local"] = remote
- Kernel.puts "job data = #{job_data.inspect}"
+ # Kernel.puts "job data = #{job_data.inspect}"
  response = controller.put_input(job_data)
  if response
  return {:result => :success}
@@ -344,24 +397,26 @@ module NeptuneHelper
  end
  end
 
+
  # This method waits for AppScale to finish compiling the user's code, indicated
  # by AppScale copying the finished code to a pre-determined location.
  def self.wait_for_compilation_to_finish(ssh_args, shadow_ip, compiled_location)
  loop {
  ssh_command = "ssh #{ssh_args} root@#{shadow_ip} 'ls #{compiled_location}' 2>&1"
- Kernel.puts ssh_command
+ # Kernel.puts ssh_command
  ssh_result = CommonFunctions.shell(ssh_command)
- Kernel.puts "result was [#{ssh_result}]"
+ # Kernel.puts "result was [#{ssh_result}]"
  if ssh_result =~ /No such file or directory/
- Kernel.puts "Still waiting for code to be compiled..."
+ # Kernel.puts "Still waiting for code to be compiled..."
  else
- Kernel.puts "compilation complete! Copying compiled code to #{copy_to}"
+ # Kernel.puts "compilation complete! Copying compiled code to #{copy_to}"
  return
  end
  sleep(5)
  }
  end
 
+
  # This method sends out a request to compile code, waits for it to finish, and
  # gets the standard out and error returned from the compilation. This method
  # returns a hash containing the standard out, error, and a result that indicates
@@ -374,7 +429,7 @@ module NeptuneHelper
  FileUtils.rm_rf(copy_to)
 
  scp_command = "scp -r #{ssh_args} root@#{shadow_ip}:#{compiled_location} #{copy_to} 2>&1"
- Kernel.puts scp_command
+ # Kernel.puts scp_command
  CommonFunctions.shell(scp_command)
 
  code = job_data["@code"]
@@ -383,13 +438,14 @@ module NeptuneHelper
 
  [remote_dir, compiled_location].each { |remote_files|
  ssh_command = "ssh #{ssh_args} root@#{shadow_ip} 'rm -rf #{remote_files}' 2>&1"
- Kernel.puts ssh_command
+ # Kernel.puts ssh_command
  CommonFunctions.shell(ssh_command)
  }
 
  return get_std_out_and_err(copy_to)
  end
 
+
  # This method returns a hash containing the standard out and standard error
  # from a completed job, as well as a result field that indicates whether or
  # not the job completed successfully (success = no errors).
@@ -411,9 +467,12 @@ module NeptuneHelper
  return result
  end
 
+
+ # This method uploads a Google App Engine application into AppScale, for use
+ # with Cicero jobs. It requires the AppScale tools to be installed.
  def self.upload_app_for_cicero(job_data)
  if !job_data["@app"]
- Kernel.puts "No app specified, not uploading..."
+ # Kernel.puts "No app specified, not uploading..."
  return
  end
 
@@ -431,12 +490,13 @@ module NeptuneHelper
  upload_app = "appscale-upload-app"
  end
 
- Kernel.puts "Uploading AppEngine app at #{app_location}"
+ # Kernel.puts "Uploading AppEngine app at #{app_location}"
  upload_command = "#{upload_app} --file #{app_location} --test --keyname #{keyname}"
- Kernel.puts upload_command
- Kernel.puts `#{upload_command}`
+ # Kernel.puts upload_command
+ # Kernel.puts `#{upload_command}`
  end
 
+
  # This method actually runs the Neptune job, given information about the job
  # as well as information about the node to send the request to.
  def self.run_job(job_data, ssh_args, shadow_ip, secret)
@@ -473,30 +533,3 @@ module NeptuneHelper
  return result
  end
  end
-
- # Make neptune() public so that babel() can call it
- public
-
- # This method is the heart of Neptune - here, we take blocks of code that the
- # user has written and convert them into HPC job requests. At a high level,
- # the user can request to run a job, retrieve a job's output, or modify the
- # access policy (ACL) for the output of a job. By default, job data is private,
- # but a Neptune job can be used to set it to public later (and vice-versa).
- def neptune(params)
- Kernel.puts "Received a request to run a job."
- Kernel.puts params[:type]
-
- job_data = NeptuneHelper.get_job_data(params)
- NeptuneHelper.validate_storage_params(job_data)
- Kernel.puts "job data = #{job_data.inspect}"
- keyname = job_data["@keyname"]
-
- shadow_ip = CommonFunctions.get_from_yaml(keyname, :shadow)
- secret = CommonFunctions.get_secret_key(keyname)
- ssh_key = File.expand_path("~/.appscale/#{keyname}.key")
- ssh_args = "-i ~/.appscale/#{keyname}.key -o StrictHostkeyChecking=no "
-
- controller = AppControllerClient.new(shadow_ip, secret)
- NeptuneHelper.do_preprocessing(job_data, controller)
- return NeptuneHelper.run_job(job_data, ssh_args, shadow_ip, secret)
- end
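
The neptune() entry point removed here from the bottom of the file is the same method re-added above the NeptuneHelper module, so rdoc now documents it. Mirroring the params hash used in the new test_babel_mpi_job test below, a hypothetical invocation (placeholder paths, keyname, and credentials) might look like:

```ruby
# Hypothetical sketch of calling the monkeypatched neptune() method; every
# value below is a placeholder, not something taken from this diff.
require 'neptune'

result = neptune(
  :type           => "mpi",                  # must be in ALLOWED_JOB_TYPES
  :code           => "/bucket/code.py",      # remote location of the code
  :nodes_to_use   => 1,
  :procs_to_use   => 1,
  :output         => "/bucket/output",
  :storage        => "s3",                   # must be in ALLOWED_STORAGE_TYPES
  :EC2_ACCESS_KEY => "access-key",
  :EC2_SECRET_KEY => "secret-key",
  :S3_URL         => "http://s3.example.com",
  :keyname        => "appscale"
)
puts result.inspect
```

Per the comment on get_job_data() above, each :key in this hash is converted to the legacy "@key" form before the request is sent to the AppController.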
@@ -7,6 +7,58 @@ require 'test/unit'
 
 
  class TestBabel < Test::Unit::TestCase
+ def test_babel_mpi_job
+ keyname = "appscale"
+ file = "/bucket/file.py"
+ params = { :type => "mpi",
+ :code => file,
+ :executable => 'python',
+ :procs_to_use => 1,
+ :nodes_to_use => 1,
+ :storage => "s3",
+ :EC2_ACCESS_KEY => "boo",
+ :EC2_SECRET_KEY => "baz",
+ :S3_URL => "http://baz.com",
+ :keyname => keyname
+ }
+
+ job_data = {}
+ params.each { |k, v|
+ job_data["@#{k}"] = v
+ }
+ job_data["@is_remote"] = true
+
+ output = "/bucket/babel/temp-0123456789"
+ job_data["@output"] = output
+
+ run_job_data = job_data.dup
+ run_job_data["@engine"] = "executor-sqs"
+ run_job_data["@run_local"] = true
+
+ output_job_data = job_data.dup
+ output_job_data["@type"] = "output"
+
+ kernel = flexmock(Kernel)
+ kernel.should_receive(:puts).and_return()
+ kernel.should_receive(:rand).and_return(0,1,2,3,4,5,6,7,8,9)
+ kernel.should_receive(:sleep).and_return()
+
+ flexmock(AppControllerClient).new_instances { |instance|
+ instance.should_receive(:does_file_exist?).with(file, job_data).and_return(true)
+ instance.should_receive(:does_file_exist?).with(output, job_data).and_return(false)
+
+ instance.should_receive(:start_neptune_job).with(run_job_data).and_return("MPI job is now running")
+
+ instance.should_receive(:get_output).with(output_job_data).and_return("output")
+ }
+
+ commonfunctions = flexmock(CommonFunctions)
+ commonfunctions.should_receive(:get_from_yaml).with(keyname, :shadow).and_return("127.0.0.1")
+ commonfunctions.should_receive(:get_secret_key).with(keyname).and_return("secret")
+
+ assert_equal("output", babel(params))
+ end
+
  def test_bad_babel_params
  job_data_no_code_param = {}
  assert_raise(BadConfigurationException) {
@@ -42,15 +94,17 @@ class TestBabel < Test::Unit::TestCase
  ENV['BABEL_BUCKET_NAME'] = "/baz"
  actual_local_2 = BabelHelper.generate_output_location(job_data_local_code)
  assert_equal(expected_local, actual_local_2)
+ ENV['BABEL_BUCKET_NAME'] = nil
 
  # Not putting the initial slash on the bucket name should be fine too.
  ENV['BABEL_BUCKET_NAME'] = "baz"
  actual_local_3 = BabelHelper.generate_output_location(job_data_local_code)
  assert_equal(expected_local, actual_local_3)
+ ENV['BABEL_BUCKET_NAME'] = nil
 
  # Finally, if we run a job and specify remote code, that should be used
  # as the bucket.
- job_data_remote_code = {"@code" => "/baz/boo/code.baz", "@is_remote" => true}
+ job_data_remote_code = {"@code" => "/baz/boo/code.baz", "@storage" => "s3"}
  expected_remote = "/baz/babel/temp-10"
 
  actual_remote = BabelHelper.generate_output_location(job_data_remote_code)
@@ -108,7 +162,8 @@ class TestBabel < Test::Unit::TestCase
  assert_equal(expected, actual_3)
  end
 
- def test_run_job
+ def test_run_babel_job
+ # Running a job with no @type specified means it should be a Babel job
  job_data = {
  "@code" => "/baz/boo/code.baz",
  "@argv" => ["boo", "/remote/babel/baz", "gbaz"]
@@ -131,6 +186,31 @@ class TestBabel < Test::Unit::TestCase
  assert_equal(expected, actual)
  end
 
+ def test_run_mpi_job
+ # Running a job with @type specified should preserve the job type
+ job_data = {
+ "@type" => "mpi",
+ "@code" => "/baz/boo/code.baz",
+ "@argv" => ["boo", "/remote/babel/baz", "gbaz"]
+ }
+
+ neptune_params = {
+ :type => "mpi",
+ :code => "/baz/boo/code.baz",
+ :argv => ["boo", "/remote/babel/baz", "gbaz"],
+ :run_local => true,
+ :engine => "executor-sqs"
+ }
+
+ result = { :result => :success }
+ kernel = flexmock(Kernel)
+ kernel.should_receive(:neptune).with(neptune_params).and_return(result)
+
+ expected = :success
+ actual = BabelHelper.run_job(job_data)[:result]
+ assert_equal(expected, actual)
+ end
+
  def test_get_output
  job_data = {
  "@output" => "/baz/boo/code.baz"