elastic-mapreduce 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. data/CHANGELOG +51 -0
  2. data/Gemfile +13 -0
  3. data/Gemfile.lock +16 -0
  4. data/LICENSE.txt +393 -0
  5. data/NOTICE.txt +26 -0
  6. data/README +1007 -0
  7. data/Rakefile +35 -0
  8. data/VERSION +1 -0
  9. data/bin/elastic-mapreduce +27 -0
  10. data/cacert.pem +280 -0
  11. data/elastic-mapreduce.gemspec +104 -0
  12. data/lib/amazon/aws/exceptions.rb +211 -0
  13. data/lib/amazon/coral/awsquery.rb +128 -0
  14. data/lib/amazon/coral/awsquerychainhelper.rb +92 -0
  15. data/lib/amazon/coral/awsqueryhandler.rb +170 -0
  16. data/lib/amazon/coral/awsqueryurihandler.rb +34 -0
  17. data/lib/amazon/coral/call.rb +68 -0
  18. data/lib/amazon/coral/dispatcher.rb +33 -0
  19. data/lib/amazon/coral/ec2client.rb +91 -0
  20. data/lib/amazon/coral/elasticmapreduceclient.rb +198 -0
  21. data/lib/amazon/coral/handler.rb +20 -0
  22. data/lib/amazon/coral/httpdelegationhelper.rb +27 -0
  23. data/lib/amazon/coral/httpdestinationhandler.rb +36 -0
  24. data/lib/amazon/coral/httphandler.rb +124 -0
  25. data/lib/amazon/coral/identityhandler.rb +32 -0
  26. data/lib/amazon/coral/job.rb +25 -0
  27. data/lib/amazon/coral/logfactory.rb +35 -0
  28. data/lib/amazon/coral/option.rb +70 -0
  29. data/lib/amazon/coral/orchestrator.rb +49 -0
  30. data/lib/amazon/coral/querystringmap.rb +93 -0
  31. data/lib/amazon/coral/service.rb +130 -0
  32. data/lib/amazon/coral/simplelog.rb +98 -0
  33. data/lib/amazon/coral/urlencoding.rb +19 -0
  34. data/lib/amazon/coral/v0signaturehandler.rb +33 -0
  35. data/lib/amazon/coral/v0signaturehelper.rb +83 -0
  36. data/lib/amazon/coral/v1signaturehandler.rb +32 -0
  37. data/lib/amazon/coral/v1signaturehelper.rb +58 -0
  38. data/lib/amazon/coral/v2signaturehandler.rb +46 -0
  39. data/lib/amazon/coral/v2signaturehelper.rb +76 -0
  40. data/lib/amazon/retry_delegator.rb +66 -0
  41. data/lib/amazon/stderr_logger.rb +23 -0
  42. data/lib/client.rb +117 -0
  43. data/lib/commands.rb +1690 -0
  44. data/lib/credentials.rb +86 -0
  45. data/lib/ec2_client_wrapper.rb +73 -0
  46. data/lib/json/lexer.rb +294 -0
  47. data/lib/json/objects.rb +200 -0
  48. data/lib/json.rb +58 -0
  49. data/lib/simple_executor.rb +11 -0
  50. data/lib/simple_logger.rb +38 -0
  51. data/lib/uuidtools/version.rb +32 -0
  52. data/lib/uuidtools.rb +655 -0
  53. data/run_tests.rb +8 -0
  54. data/samples/freebase/code/freebase_jobflow.json +44 -0
  55. data/samples/similarity/lastfm_jobflow.json +78 -0
  56. data/samples/wordSplitter.py +18 -0
  57. data/tests/commands_test.rb +587 -0
  58. data/tests/credentials.json +7 -0
  59. data/tests/example.json +14 -0
  60. metadata +154 -0
data/lib/commands.rb ADDED
@@ -0,0 +1,1690 @@
1
+ #
2
+ # Copyright 2008-2010 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3
+
4
+ require 'set'
5
+ require 'credentials'
6
+ require 'optparse'
7
+ require 'client'
8
+ require 'ec2_client_wrapper'
9
+ require 'open3'
10
+
11
+ module Commands
12
+
13
+ ELASTIC_MAPREDUCE_CLIENT_VERSION = "2010-11-11"
14
+
15
# Container for the parsed command-line commands plus options shared by all
# of them. Collects Command objects as OptionParser callbacks fire, then
# validates and enacts them in order.
class Commands
  attr_accessor :opts, :global_options, :commands, :logger, :executor

  def initialize(logger, executor)
    @commands       = []
    @opts           = nil
    @global_options = { :jobflow => [] }
    @logger         = logger
    @executor       = executor
  end

  # Most recently added command.
  def last
    @commands.last
  end

  def <<(value)
    @commands << value
  end

  def size
    @commands.size
  end

  def validate
    @commands.each { |cmd| cmd.validate }
  end

  def enact(client)
    @commands.each { |cmd| cmd.enact(client) }
  end

  def each(&block)
    @commands.each(&block)
  end

  # Register an option that creates a new top-level command of klass.
  def parse_command(klass, name, description)
    @opts.on(name, description) do |arg|
      self << klass.new(name, description, arg, self)
    end
  end

  # Register an option that attaches itself to a previously parsed command.
  def parse_option(klass, name, description, parent_commands, *args)
    @opts.on(name, description) do |arg|
      klass.new(name, description, arg, parent_commands, self, *args).attach(commands)
    end
  end

  def parse_options(parent_commands, options)
    options.each do |option|
      klass, name, description = option[0..2]
      extra = option[3..-1]
      parse_option(klass, name, description, parent_commands, *extra)
    end
  end

  # Pull bare jobflow ids (j-XXXXX...) out of the remaining arguments.
  def parse_jobflows(args)
    args.each do |arg|
      @global_options[:jobflow] << arg if arg =~ /^j-\w{5,20}$/
    end
  end

  def have(field_symbol)
    return @global_options[field_symbol] != nil
  end

  def get_field(field_symbol, default_value=nil)
    value = @global_options[field_symbol]
    value == nil ? default_value : value
  end

  def exec(cmd)
    @executor.exec(cmd)
  end
end
97
+
98
# Base class for all CLI commands. Provides field lookup that cascades from
# the command's own accessors, to the shared global options, to a
# per-command default_<field> method.
class Command
  attr_accessor :name, :description, :arg, :commands, :logger

  def initialize(name, description, arg, commands)
    @name        = name
    @description = description
    @arg         = arg
    @commands    = commands
    @logger      = commands.logger
  end

  # test any constraints that the command has
  def validate
  end

  # action the command
  def enact(client)
  end

  # Record an option value: scalar fields may be set once, array fields
  # accumulate repeated values.
  def option(argument_name, argument_symbol, value)
    current = send(argument_symbol)
    if current == nil then
      send("#{argument_symbol}=".to_sym, value)
    elsif current.is_a?(Array) then
      current << value
    else
      raise RuntimeError, "Repeating #{argument_name} is not allowed, previous value was #{current.inspect}"
    end
  end

  # Look up a field: own accessor, then global options, then a
  # default_<field> method, then the supplied default value.
  def get_field(field_symbol, default_value=nil)
    value = respond_to?(field_symbol) ? send(field_symbol) : nil
    value = @commands.global_options[field_symbol] if value == nil
    if value == nil then
      fallback = ("default_" + field_symbol.to_s).to_sym
      value = send(fallback) if respond_to?(fallback)
    end
    value == nil ? default_value : value
  end

  # Like get_field but raises with error_msg when the field is absent.
  # (Intentionally shadows Kernel#require within command classes.)
  def require(field_symbol, error_msg)
    value = get_field(field_symbol)
    raise RuntimeError, error_msg if value == nil
    value
  end

  def have(field_symbol)
    get_field(field_symbol) != nil
  end

  # Walk obj through all but the last key and compare against the final one.
  def has_value(obj, *args)
    while obj != nil && args.size > 1 do
      obj = obj[args.shift]
    end
    obj == args[0]
  end

  # Walk obj through every key, stopping at the first nil.
  def resolve(obj, *args)
    while obj != nil && args.size > 0 do
      obj = obj[args.shift]
    end
    obj
  end

  # Return the single target jobflow id, raising unless exactly one was given.
  def require_single_jobflow
    ids = get_field(:jobflow)
    if ids.size == 0 then
      raise RuntimeError, "A jobflow is required to use option #{name}"
    elsif ids.size > 1 then
      raise RuntimeError, "The option #{name} can only act on a single jobflow"
    end
    ids.first
  end

end
184
+
185
# An option that modifies a previously parsed command; attach walks backwards
# through the command list to find the most recent eligible parent.
class CommandOption
  attr_accessor :name, :description, :arg, :parent_commands, :commands

  def initialize(name, description, arg, parent_commands, commands, field_symbol=nil, pattern=nil)
    @name            = name
    @description     = description
    @arg             = arg
    @parent_commands = parent_commands
    @commands        = commands
    @field_symbol    = field_symbol
    @pattern         = pattern
  end

  # Return the most recent command whose leading word or class appears in
  # parent_commands; raise if no such command has been parsed yet.
  def attach(commands)
    commands.reverse.each do |command|
      first_word = command.name.split(/\s+/).first
      if @parent_commands.include?(first_word) || @parent_commands.include?(command.class) then
        return command
      end
    end
    raise RuntimeError, "Expected argument #{name} to follow one of #{parent_commands.join(", ")}"
  end
end
208
+
209
# Base class for commands that add steps to a jobflow. Supplies default
# locations, rooted at apps_path, for the helper scripts and jars that ship
# with Elastic MapReduce.
class StepCommand < Command
  attr_accessor :args, :step_name, :step_action, :apps_path, :beta_path
  attr_accessor :script_runner_path, :pig_path, :hive_path, :pig_cmd, :hive_cmd, :enable_debugging_path

  def initialize(*args)
    super(*args)
    @args = []
  end

  def default_script_runner_path
    File.join(get_field(:apps_path), "libs/script-runner/script-runner.jar")
  end

  def default_pig_path
    File.join(get_field(:apps_path), "libs/pig/")
  end

  def default_pig_cmd
    [ File.join(get_field(:pig_path), "pig-script"), "--base-path",
      get_field(:pig_path) ]
  end

  def default_hive_path
    File.join(get_field(:apps_path), "libs/hive/")
  end

  def default_hive_cmd
    [ File.join(get_field(:hive_path), "hive-script"), "--base-path",
      get_field(:hive_path) ]
  end

  def default_resize_jobflow_cmd
    File.join(get_field(:apps_path), "libs/resize-job-flow/0.1/resize-job-flow.jar")
  end

  def default_enable_debugging_path
    File.join(get_field(:apps_path), "libs/state-pusher/0.1")
  end

  def validate
    super
    require(:apps_path, "--apps-path path must be defined")
  end

  # The script location (@arg) followed by any extra --arg/--args values.
  def script_args
    if @arg then
      [ @arg ] + @args
    else
      @args
    end
  end

  # ["--args", ...] suffix for install steps, or [] when there are none.
  def extra_args
    if @args != nil && @args.size > 0 then
      return ["--args"] + @args
    else
      return []
    end
  end

  # Ensure that exactly one install step (built by install_step_class)
  # precedes the first step of this command's class, unless the jobflow has
  # already run a matching non-failed install step. Duplicate install
  # commands later in the list are dropped.
  # (Cleanup: removed the unused local `has_install` from the original.)
  def ensure_install_cmd(jobflow, sc, install_step_class)
    install_step = install_step_class.new_from_commands(commands, self)
    if install_step.jobflow_has_install_step(jobflow) then
      return sc
    else
      new_sc = []
      has_install_pi = false
      for sc_cmd in sc do
        if sc_cmd.is_a?(install_step_class) then
          if has_install_pi then
            next   # drop duplicate install commands
          else
            has_install_pi = true
          end
        end
        if sc_cmd.is_a?(self.class) then
          if ! has_install_pi then
            has_install_pi = true
            new_sc << install_step
            install_step.validate
          end
        end
        new_sc << sc_cmd
      end
    end
    return new_sc
  end

  # Subclasses may rewrite the full step-command list (e.g. to inject an
  # install step); the default keeps it unchanged.
  def reorder_steps(jobflow, sc)
    return sc
  end
end
302
+
303
# Adds a step that resizes a running jobflow via the resize-job-flow jar.
class ResizeJobflowCommand < StepCommand
  def validate
    super
  end

  def steps
    [
      {
        "Name"            => get_field(:step_name, "Resize Job Flow Command"),
        "ActionOnFailure" => get_field(:step_action, "CANCEL_AND_WAIT"),
        "HadoopJarStep"   => {
          "Jar"  => get_field(:resize_jobflow_cmd),
          "Args" => @args
        }
      }
    ]
  end

end
321
+
322
# Adds the debugging setup step (state-pusher) and forces it to run first.
class EnableDebuggingCommand < StepCommand
  def steps
    [
      {
        "Name"            => get_field(:step_name, "Setup Hadoop Debugging"),
        "ActionOnFailure" => get_field(:step_action, "TERMINATE_JOB_FLOW"),
        "HadoopJarStep"   => {
          "Jar"  => get_field(:script_runner_path),
          "Args" => [ File.join(get_field(:enable_debugging_path), "fetch") ]
        }
      }
    ]
  end

  # remove enable debugging steps and add self at start
  def reorder_steps(jobflow, sc)
    others = sc.reject { |step_cmd| step_cmd.is_a?(EnableDebuggingCommand) }
    [ self ] + others
  end
end
346
+
347
# Adds a step that runs a Pig script (@arg is the script location).
class PigScriptCommand < StepCommand
  def steps
    mandatory_args = [ "--run-pig-script", "--args", "-f" ]
    mandatory_args << @arg if @arg
    [
      {
        "Name"            => get_field(:step_name, "Run Pig Script"),
        "ActionOnFailure" => get_field(:step_action, "CANCEL_AND_WAIT"),
        "HadoopJarStep"   => {
          "Jar"  => get_field(:script_runner_path),
          "Args" => get_field(:pig_cmd) + mandatory_args + @args
        }
      }
    ]
  end

  # Make sure Pig is installed on the jobflow before this step runs.
  def reorder_steps(jobflow, sc)
    ensure_install_cmd(jobflow, sc, PigInteractiveCommand)
  end
end
369
+
370
# Installs Pig on the jobflow so that Pig scripts or interactive sessions
# can run.
class PigInteractiveCommand < StepCommand
  # Build an install command mirroring the step_action of the step that
  # required it (used by StepCommand#ensure_install_cmd).
  def self.new_from_commands(commands, parent)
    sc = self.new("--pig-interactive", "Run a jobflow with Pig Installed", nil, commands)
    sc.step_action = parent.step_action
    return sc
  end

  def steps
    step = {
      "Name" => get_field(:step_name, "Setup Pig"),
      "ActionOnFailure" => get_field(:step_action, "TERMINATE_JOB_FLOW"),
      "HadoopJarStep" => {
        "Jar" => get_field(:script_runner_path),
        "Args" => get_field(:pig_cmd) + ["--install-pig"] + extra_args
      }
    }
    return [ step ]
  end

  # True if the jobflow already ran a non-failed install-pig step.
  # NOTE(review): assumes "--install-pig" sits at Args index 3, i.e. pig_cmd
  # contributes exactly three leading arguments -- confirm against
  # StepCommand#default_pig_cmd.
  def jobflow_has_install_step(jobflow)
    install_steps = jobflow['Steps'].select do |step|
      step["ExecutionStatusDetail"]["State"] != "FAILED" &&
      has_value(step, 'StepConfig', 'HadoopJarStep', 'Jar', get_field(:script_runner_path)) &&
      has_value(step, 'StepConfig', 'HadoopJarStep', 'Args', 3, "--install-pig")
    end
    return install_steps.size > 0
  end
end
398
+
399
# Shared behavior for Hive-related step commands: handles the optional
# --hive-versions selection.
class HiveCommand < StepCommand
  attr_accessor :hive_versions

  # Return ["--hive-versions", versions] when versions were requested, or []
  # otherwise. When require_single_version is set (script/site install),
  # reject comma-separated lists since those steps target one version.
  def get_version_args(require_single_version)
    versions = get_field(:hive_versions, nil)
    if versions == nil then
      return []
    end
    if require_single_version then
      if versions.split(",").size != 1 then
        # bug fix: corrected typo in user-facing message ("my" -> "may")
        raise RuntimeError, "Only one version may be specified for --hive-script"
      end
    end
    return ["--hive-versions", versions]
  end

end
416
+
417
# Installs a user-supplied hive-site.xml (@arg is its location) onto the
# cluster via the hive-script helper.
class HiveSiteCommand < HiveCommand

  def steps
    step = {
      "Name" => get_field(:step_name, "Install Hive Site Configuration"),
      "ActionOnFailure" => get_field(:step_action, "CANCEL_AND_WAIT"),
      "HadoopJarStep" => {
        "Jar" => get_field(:script_runner_path),
        # get_version_args(true): a site configuration targets exactly one
        # Hive version, so multiple --hive-versions are rejected here
        "Args" => get_field(:hive_cmd) + [ "--install-hive-site", "--hive-site=#{@arg}" ] +
        extra_args + get_version_args(true)
      }
    }
    return [ step ]
  end

  # Make sure Hive is installed on the jobflow before this step runs.
  def reorder_steps(jobflow, sc)
    return ensure_install_cmd(jobflow, sc, HiveInteractiveCommand)
  end
end
436
+
437
# Runs a Hive script (@arg is the script location) as a jobflow step.
class HiveScriptCommand < HiveCommand

  def steps
    mandatory_args = [ "--run-hive-script", "--args", "-f" ]
    if @arg then
      mandatory_args << @arg
    end
    step = {
      "Name" => get_field(:step_name, "Run Hive Script"),
      "ActionOnFailure" => get_field(:step_action, "CANCEL_AND_WAIT"),
      "HadoopJarStep" => {
        "Jar" => get_field(:script_runner_path),
        # get_version_args(true): a script runs under exactly one Hive version
        "Args" => get_field(:hive_cmd) + get_version_args(true) + mandatory_args + @args
      }
    }
    [ step ]
  end

  # Make sure Hive is installed on the jobflow before this step runs.
  def reorder_steps(jobflow, sc)
    return ensure_install_cmd(jobflow, sc, HiveInteractiveCommand)
  end
end
459
+
460
# Installs Hive on the jobflow (optionally several versions at once).
class HiveInteractiveCommand < HiveCommand

  def steps
    step = {
      "Name" => get_field(:step_name, "Setup Hive"),
      "ActionOnFailure" => get_field(:step_action, "TERMINATE_JOB_FLOW"),
      "HadoopJarStep" => {
        "Jar" => get_field(:script_runner_path),
        # get_version_args(false): installation may cover multiple versions
        "Args" => get_field(:hive_cmd) + [ "--install-hive" ] +
        get_version_args(false) + extra_args
      }
    }
    [ step ]
  end

  # True if the jobflow already ran a non-failed install-hive step for the
  # requested version.
  # NOTE(review): assumes "--install-hive" sits at Args index 3 and the
  # version value at index 5, i.e. hive_cmd contributes exactly three
  # leading arguments -- confirm against StepCommand#default_hive_cmd.
  # get_version_args(true)[1] is nil when no versions were requested.
  def jobflow_has_install_step(jobflow)
    install_steps = jobflow['Steps'].select do |step|
      step["ExecutionStatusDetail"]["State"] != "FAILED" &&
      has_value(step, 'StepConfig', 'HadoopJarStep', 'Jar', get_field(:script_runner_path)) &&
      has_value(step, 'StepConfig', 'HadoopJarStep', 'Args', 3, "--install-hive") &&
      has_value(step, 'StepConfig', 'HadoopJarStep', 'Args', 5, get_version_args(true)[1])
    end
    return install_steps.size > 0
  end

  # Build an install command mirroring the parent step's versions and action
  # (used by StepCommand#ensure_install_cmd).
  def self.new_from_commands(commands, parent)
    sc = self.new("--hive-interactive", "Run a jobflow with Hive Installed", nil, commands)
    sc.hive_versions = parent.hive_versions
    sc.step_action = parent.step_action
    return sc
  end
end
492
+
493
# Adds a step that runs an arbitrary user-supplied jar.
class JarStepCommand < StepCommand
  attr_accessor :main_class

  def steps
    jar_step = {
      "Jar" => get_field(:arg),
      "Args" => get_field(:args, [])
    }
    main = get_field(:main_class)
    jar_step["MainClass"] = main if main
    [
      {
        "Name" => get_field(:step_name, "Example Jar Step"),
        "ActionOnFailure" => get_field(:step_action, "CANCEL_AND_WAIT"),
        "HadoopJarStep" => jar_step
      }
    ]
  end
end
511
+
512
# Builds a Hadoop streaming step. Defaults reproduce the bundled wordcount
# sample when no input/output/mapper/reducer are given.
class StreamStepCommand < StepCommand
  attr_accessor :input, :output, :mapper, :cache, :cache_archive, :jobconf, :reducer, :args

  # Hadoop "generic" options; Hadoop requires these to precede the
  # streaming (command) options on the argument list.
  GENERIC_OPTIONS = Set.new(%w(-conf -D -fs -jt -files -libjars -archives))

  def steps
    # timestamp used to build a unique default output directory
    timestr = Time.now.strftime("%Y-%m-%dT%H%M%S")
    stream_options = []
    for ca in get_field(:cache, []) do
      stream_options << "-cacheFile" << ca
    end

    for ca in get_field(:cache_archive, []) do
      stream_options << "-cacheArchive" << ca
    end

    for jc in get_field(:jobconf, []) do
      stream_options << "-jobconf" << jc
    end

    # Note that the streaming options should go before command options for
    # Hadoop 0.20
    step = {
      "Name" => get_field(:step_name, "Example Streaming Step"),
      "ActionOnFailure" => get_field(:step_action, "CANCEL_AND_WAIT"),
      "HadoopJarStep" => {
        "Jar" => "/home/hadoop/contrib/streaming/hadoop-streaming.jar",
        "Args" => (sort_streaming_args(get_field(:args))) + (stream_options) + [
          "-input", get_field(:input, "s3n://elasticmapreduce/samples/wordcount/input"),
          "-output", get_field(:output, "hdfs:///examples/output/#{timestr}"),
          "-mapper", get_field(:mapper, "s3n://elasticmapreduce/samples/wordcount/wordSplitter.py"),
          "-reducer", get_field(:reducer, "aggregate")
        ]
      }
    }
    return [ step ]
  end

  # Move generic options (together with their values) to the front of the
  # argument list, keeping everything else in place.
  # NOTE(review): the repeated unshift puts multiple generic option pairs in
  # reverse relative order -- confirm that this is acceptable to Hadoop.
  def sort_streaming_args(streaming_args)
    sorted_streaming_args = []
    i=0
    while streaming_args && i < streaming_args.length
      if GENERIC_OPTIONS.include?(streaming_args[i]) then
        if i+1 < streaming_args.length
          sorted_streaming_args.unshift(streaming_args[i+1])
          sorted_streaming_args.unshift(streaming_args[i])
          i=i+2
        else
          raise RuntimeError, "Missing value for argument #{streaming_args[i]}"
        end
      else
        sorted_streaming_args << streaming_args[i]
        i=i+1
      end
    end
    return sorted_streaming_args
  end
end
570
+
571
# Base for commands that talk to the jobflow master node over ssh/scp.
# Resolves the master hostname (waiting for the jobflow to start unless
# --no-wait) and the key pair file before the subclass acts.
class AbstractSSHCommand < Command
  attr_accessor :no_wait, :dest, :hostname, :key_pair_file, :jobflow_id, :jobflow_detail

  CLOSED_DOWN_STATES = Set.new(%w(TERMINATED SHUTTING_DOWN COMPLETED FAILED))
  WAITING_OR_RUNNING_STATES = Set.new(%w(WAITING RUNNING))

  def exec(cmd)
    commands.exec(cmd)
  end

  # Poll DescribeJobFlows every 30s until the jobflow is ssh-able; raise if
  # it shuts down first.
  def wait_for_jobflow(client)
    while true do
      state = resolve(self.jobflow_detail, "ExecutionStatusDetail", "State")
      if WAITING_OR_RUNNING_STATES.include?(state) then
        break
      elsif CLOSED_DOWN_STATES.include?(state) then
        raise RuntimeError, "Jobflow entered #{state} while waiting to ssh"
      else
        logger.info("Jobflow is in state #{state}, waiting....")
        sleep(30)
        self.jobflow_detail = client.describe_jobflow_with_id(jobflow_id)
      end
    end
  end

  # Resolve jobflow_id, hostname and key_pair_file for subclasses; raises if
  # --key-pair-file is missing or more than one jobflow was given.
  def enact(client)
    self.jobflow_id = require_single_jobflow
    self.jobflow_detail = client.describe_jobflow_with_id(self.jobflow_id)
    if ! get_field(:no_wait) then
      wait_for_jobflow(client)
    end
    self.hostname = self.jobflow_detail['Instances']['MasterPublicDnsName']
    self.key_pair_file = require(:key_pair_file, "Missing required option --key-pair-file for #{name}")
  end
end
606
+
607
# Opens an ssh session (or runs a remote command) on the jobflow master.
class SSHCommand < AbstractSSHCommand
  attr_accessor :cmd

  def initialize(*args)
    super(*args)
    # If the argument looks like a jobflow id, treat it as the target
    # jobflow; otherwise treat it as the remote command to run.
    # NOTE(review): the regexp is unanchored, so any argument merely
    # containing a jobflow id is consumed as one -- confirm intended.
    if @arg =~ /j-[A-Z0-9]{8,20}/ then
      commands.global_options[:jobflow] << @arg
    else
      self.cmd = @arg
    end
  end

  def enact(client)
    super(client)
    exec "ssh -i #{key_pair_file} hadoop@#{hostname} #{get_field(:cmd, "")}"
  end
end
624
+
625
# Copy a local file (@arg) to the jobflow master node over scp. The remote
# destination defaults to the file's basename.
class PutCommand < AbstractSSHCommand
  def enact(client)
    super(client)
    destination = get_field(:dest) ? get_field(:dest) : File.basename(@arg)
    exec "scp -i #{key_pair_file} #{@arg} hadoop@#{hostname}:#{destination}"
  end
end
635
+
636
# Copy a remote file (@arg) from the jobflow master node over scp. The local
# destination defaults to the file's basename.
class GetCommand < AbstractSSHCommand
  def enact(client)
    super(client)
    destination = get_field(:dest) ? get_field(:dest) : File.basename(@arg)
    exec "scp -i #{key_pair_file} hadoop@#{hostname}:#{@arg} #{destination}"
  end
end
646
+
647
# Ask the master node's Hive installation for its version over ssh.
class PrintHiveVersionCommand < AbstractSSHCommand
  def enact(client)
    super(client)
    stdin, stdout, stderr = Open3.popen3("ssh -i #{key_pair_file} hadoop@#{hostname} '/home/hadoop/bin/hive -v'")
    version = stdout.readlines.join
    err = stderr.readlines.join
    if version.length > 0
      puts version
    # Fall back to reading the version out of the hive path when the remote
    # hive does not answer -v. NOTE(review): "determing" matches the remote
    # tool's own misspelled message -- do not "correct" it here.
    elsif err =~ /Unrecognised option/ or err =~ /Error while determing Hive version/
      stdin, stdout, stderr = Open3.popen3("ssh -i #{key_pair_file} hadoop@#{hostname} 'ls -l /home/hadoop/bin/hive'")
      version = stdout.readlines.join
      # extract the version from a path like .../hive-0.5/bin/hive; $1 is
      # nil if the pattern does not match
      version =~ /hive-(.*)\/bin\/hive/
      puts "Hive version " + $1
    else
      puts "Unable to determine Hive version"
    end
  end
end
665
+
666
# Fetch and print the logs (syslog/stderr/stdout) of a jobflow step over ssh.
class LogsCommand < AbstractSSHCommand
  attr_accessor :step_index

  INTERESTING_STEP_STATES = ['RUNNING', 'COMPLETED', 'FAILED']

  def enact(client)
    super(client)

    # find the last interesting step if that exists
    if get_field(:step_index) == nil then
      steps = resolve(jobflow_detail, "Steps")
      interesting = (0 ... steps.size).select { |index|
        INTERESTING_STEP_STATES.include?(resolve(steps, index, 'ExecutionStatusDetail', 'State'))
      }
      # bug fix: previously crashed with NoMethodError (nil + 1) when no
      # step was in an interesting state; now falls through to the
      # RuntimeError below. The +1 maps the 0-based index to the step log
      # directory name (preserved from the original; presumably 1-based on
      # the cluster -- confirm).
      if interesting.size > 0 then
        self.step_index = interesting.last + 1
      end
    end

    if get_field(:step_index) then
      logger.puts "Listing steps for step #{get_field(:step_index)}"
      exec "ssh -i #{key_pair_file} hadoop@#{hostname} cat /mnt/var/log/hadoop/steps/#{get_field(:step_index)}/{syslog,stderr,stdout}"
    else
      raise RuntimeError, "No steps that could have logs found in jobflow"
    end
  end
end
690
+
691
# An option stored in the shared global_options map rather than on a single
# command. Array-valued fields accumulate; scalar fields may be set once.
class GlobalOption < CommandOption
  def attach(commands)
    global_options = @commands.global_options
    existing = global_options[@field_symbol]
    case existing
    when Array
      existing << @arg
    when nil
      global_options[@field_symbol] = @arg
    else
      raise RuntimeError, "You may not specify #{@name} twice"
    end
    return nil
  end
end
705
+
706
# A flag stored in global_options; may only be given once.
class GlobalFlagOption < CommandOption
  def attach(command)
    global_options = @commands.global_options
    if global_options[@field_symbol] == nil then
      global_options[@field_symbol] = @arg
    else
      raise RuntimeError, "You may not specify #{@name} twice"
    end
  end
end
717
+
718
# Base for commands that collect step sub-commands (--jar, --hive-script,
# ...) and let each of them rewrite the final step list (e.g. to inject
# install steps).
class StepProcessingCommand < Command
  attr_accessor :step_commands

  def initialize(*args)
    super(*args)
    @step_commands = []
  end

  # Give every step command a chance to rewrite the full list; each pass
  # receives the result of the previous one.
  def reorder_steps(jobflow, sc)
    reordered = sc.dup
    sc.each do |step_command|
      reordered = step_command.reorder_steps(jobflow, reordered)
    end
    reordered
  end
end
735
+
736
# Appends the collected step commands to an existing jobflow via
# AddJobFlowSteps.
class AddJobFlowStepsCommand < StepProcessingCommand

  def add_step_command(step)
    @step_commands << step
  end

  def validate
    for cmd in step_commands do
      cmd.validate
    end
  end

  def enact(client)
    jobflow_id = require_single_jobflow
    jobflow = client.describe_jobflow_with_id(jobflow_id)
    # let step commands inject/prune install steps based on the jobflow's
    # current state before serializing
    self.step_commands = reorder_steps(jobflow, self.step_commands)
    jobflow_steps = step_commands.map { |x| x.steps }.flatten
    client.add_steps(jobflow_id, jobflow_steps)
    logger.puts("Added jobflow steps")
  end
end
757
+
758
# Assembles a RunJobFlow request from the collected step, instance-group and
# bootstrap-action sub-commands and submits it.
class CreateJobFlowCommand < StepProcessingCommand
  attr_accessor :jobflow_name, :alive, :with_termination_protection, :instance_count, :slave_instance_type,
    :master_instance_type, :key_pair, :key_pair_file, :log_uri, :az, :ainfo,
    :hadoop_version, :plain_output, :instance_type,
    :instance_group_commands, :bootstrap_commands


  # Legacy sizing options, mutually exclusive with an explicit CORE group.
  OLD_OPTIONS = [:instance_count, :slave_instance_type, :master_instance_type]
  # FIXME: add code to setup collapse instance group commands

  DEFAULT_HADOOP_VERSION = "0.20"

  def initialize(*args)
    super(*args)
    @instance_group_commands = []
    @bootstrap_commands = []
  end

  def add_step_command(step)
    @step_commands << step
  end

  def add_bootstrap_command(bootstrap_command)
    @bootstrap_commands << bootstrap_command
  end

  def add_instance_group_command(instance_group_command)
    @instance_group_commands << instance_group_command
  end

  def validate
    # debugging needs somewhere to push state, hence the log_uri requirement
    for step in step_commands do
      if step.is_a?(EnableDebuggingCommand) then
        require(:log_uri, "You must supply a logUri if you enable debugging when creating a job flow")
      end
    end

    for cmd in step_commands + instance_group_commands + bootstrap_commands do
      cmd.validate
    end

    if ! have(:hadoop_version) then
      @hadoop_version = DEFAULT_HADOOP_VERSION
    end
  end

  def enact(client)
    @jobflow = create_jobflow

    apply_jobflow_option(:ainfo, "AdditionalInfo")
    apply_jobflow_option(:key_pair, "Instances", "Ec2KeyName")
    apply_jobflow_option(:hadoop_version, "Instances", "HadoopVersion")
    apply_jobflow_option(:az, "Instances", "Placement", "AvailabilityZone")
    apply_jobflow_option(:log_uri, "LogUri")

    # let step commands inject install steps / reorder before serializing
    self.step_commands = reorder_steps(@jobflow, self.step_commands)
    @jobflow["Steps"] = step_commands.map { |x| x.steps }.flatten

    setup_instance_groups
    @jobflow["Instances"]["InstanceGroups"] = instance_group_commands.map { |x| x.instance_group }

    # bootstrap action default names are numbered from 1
    bootstrap_action_index = 1
    for bootstrap_action_command in bootstrap_commands do
      @jobflow["BootstrapActions"] << bootstrap_action_command.bootstrap_action(
        bootstrap_action_index)
      bootstrap_action_index += 1
    end

    run_result = client.run_jobflow(@jobflow)
    jobflow_id = run_result['JobFlowId']
    # record the new id so later commands in this invocation can target it
    commands.global_options[:jobflow] << jobflow_id

    if have(:plain_output) then
      logger.puts jobflow_id
    else
      logger.puts "Created job flow " + jobflow_id
    end
  end

  # Set a single (possibly nested) key in the request if the corresponding
  # field is present, creating intermediate hashes as needed.
  def apply_jobflow_option(field_symbol, *keys)
    value = get_field(field_symbol)
    if value != nil then
      map = @jobflow
      for key in keys[0..-2] do
        nmap = map[key]
        if nmap == nil then
          map[key] = {}
          nmap = map[key]
        end
        map = nmap
      end
      map[keys.last] = value
    end
  end

  def new_instance_group_command(role, instance_count, instance_type)
    igc = CreateInstanceGroupCommand.new(
      "--instance-group ROLE", "Specify an instance group", role, commands
    )
    igc.instance_count = instance_count
    igc.instance_type = instance_type
    return igc
  end

  # True if any collected instance group has the given (upper-case) role.
  # NOTE(review): assumes every group's instance_role is non-nil -- confirm
  # against CreateInstanceGroupCommand.
  def have_role(instance_group_commands, role)
    instance_group_commands.select { |x|
      x.instance_role.upcase == role
    }.size > 0
  end

  # Fill in MASTER/CORE groups from the legacy --instance-type /
  # --num-instances options when no explicit groups were given.
  def setup_instance_groups
    instance_groups = []
    if ! have_role(instance_group_commands, "MASTER") then
      mit = get_field(:master_instance_type, get_field(:instance_type, "m1.small"))
      master_instance_group = new_instance_group_command("MASTER", 1, mit)
      instance_group_commands << master_instance_group
    end
    if ! have_role(instance_group_commands, "CORE") then
      # one instance is the master; the remainder become the CORE group
      ni = get_field(:instance_count, 1).to_i
      if ni > 1 then
        sit = get_field(:slave_instance_type, get_field(:instance_type, "m1.small"))
        slave_instance_group = new_instance_group_command("CORE", ni-1, sit)
        slave_instance_group.instance_role = "CORE"
        instance_group_commands << slave_instance_group
      end
    else
      # Verify that user has not specified both --instance-group core and --num-instances
      if get_field(:instance_count) != nil then
        raise RuntimeError, "option --num-instances cannot be used when a core instance group is specified."
      end
    end
  end

  # Skeleton RunJobFlow request (also memoized into @jobflow).
  def create_jobflow
    @jobflow = {
      "Name" => get_field(:jobflow_name, default_job_flow_name),
      "Instances" => {
        "KeepJobFlowAliveWhenNoSteps" => (get_field(:alive) ? "true" : "false"),
        "TerminationProtected" => (get_field(:with_termination_protection) ? "true" : "false"),
        "InstanceGroups" => []
      },
      "Steps" => [],
      "BootstrapActions" => []
    }
  end

  def default_job_flow_name
    name = "Development Job Flow"
    if get_field(:alive) then
      name += " (requires manual termination)"
    end
    return name
  end
end
912
+
913
# Represents one --bootstrap-action: a script (@arg) plus its arguments.
class BootstrapActionCommand < Command
  attr_accessor :bootstrap_name, :args

  def initialize(*args)
    super(*args)
    @args = []
  end

  # Build the ScriptBootstrapAction request entry; index is only used to
  # produce a default name.
  def bootstrap_action(index)
    {
      "Name" => get_field(:bootstrap_name, "Bootstrap Action #{index}"),
      "ScriptBootstrapAction" => {
        "Path" => @arg,
        "Args" => @args
      }
    }
  end
end
932
+
933
# Shared DescribeJobFlows behavior: works out which jobflows to query
# (explicit ids, states, --all, or a recency window) and records the ids of
# everything returned into the shared global jobflow list.
class AbstractListCommand < Command
  attr_accessor :state, :max_results, :active, :all, :no_steps

  def enact(client)
    options = {}
    states = []
    if get_field(:jobflow, []).size > 0 then
      options = { 'JobFlowIds' => get_field(:jobflow) }
    else
      if get_field(:active) then
        states = %w(RUNNING SHUTTING_DOWN STARTING WAITING BOOTSTRAPPING)
      end
      if get_field(:states) then
        # bug fix: was `get_field(states)` -- passing the local array itself
        # as the field key, which returned nil and made the += crash.
        states += get_field(:states)
      end
      if get_field(:active) || get_field(:states) then
        options = { 'JobFlowStates' => states }
      elsif get_field(:all) then
        options = { }
      else
        # default: jobflows created in the last two days
        # NOTE(review): Time#xmlschema needs 'time' required; assumed loaded
        # elsewhere in the tool.
        options = { 'CreatedAfter' => (Time.now - (2 * 24 * 3600)).xmlschema }
      end
    end
    result = client.describe_jobflow(options)
    # add the described jobflow to the supplied jobflows
    commands.global_options[:jobflow] += result['JobFlows'].map { |x| x['JobFlowId'] }
    commands.global_options[:jobflow].uniq!

    return result
  end
end
964
+
965
# Human-readable jobflow listing: one line per jobflow, optionally followed
# by one line per step.
class ListActionCommand < AbstractListCommand

  # Render selected (possibly dotted) key paths of map as left-justified
  # fixed-width columns; each field is a [key_path, width] pair.
  # NOTE: shadows Kernel#format within this class.
  def format(map, *fields)
    result = []
    for field in fields do
      key = field[0].split(".")
      value = map
      while key.size > 0 do
        value = value[key.first]
        key.shift
      end
      result << sprintf("%-#{field[1]}s", value)
    end
    result.join("")
  end

  def enact(client)
    result = super(client)
    job_flows = result['JobFlows']
    count = 0
    for job_flow in job_flows do
      # stop once --max-results is reached
      if get_field(:max_results) && (count += 1) > get_field(:max_results) then
        break
      end
      logger.puts format(job_flow, ['JobFlowId', 20], ['ExecutionStatusDetail.State', 15],
        ['Instances.MasterPublicDnsName', 50]) + job_flow['Name']
      if ! get_field(:no_steps) then
        for step in job_flow['Steps'] do
          logger.puts " " + format(step, ['ExecutionStatusDetail.State', 15], ['StepConfig.Name', 30])
        end
      end
    end
  end
end
999
+
1000
# --describe: dump the raw DescribeJobFlows response as pretty-printed JSON.
class DescribeActionCommand < AbstractListCommand
  def enact(client)
    logger.puts(JSON.pretty_generate(super(client)))
  end
end
1006
+
1007
# --set-termination-protection BOOL: toggle termination protection on
# the supplied job flows. Protection is enabled only when the argument
# is exactly the string 'true'.
class SetTerminationProtection < Command
  def enact(client)
    job_flow = get_field(:jobflow)
    termination_protected = @arg == 'true'
    client.set_termination_protection(job_flow, termination_protected)
    # BUGFIX: the message was inverted -- it printed "Disabled" when
    # protection had just been enabled, and vice versa.
    logger.puts "#{termination_protected ? "Enabled" : "Disabled"} job flow termination " + job_flow.join(" ")
  end
end
1015
+
1016
# --terminate: shut down every job flow supplied on the command line.
class TerminateActionCommand < Command
  def enact(client)
    flows = get_field(:jobflow)
    client.terminate_jobflows(flows)
    logger.puts("Terminated job flow " + flows.join(" "))
  end
end
1023
+
1024
# --version: print the client version string.
class VersionCommand < Command
  def enact(client)
    logger.puts("Version #{ELASTIC_MAPREDUCE_CLIENT_VERSION}")
  end
end
1029
+
1030
# --help / -h: print the OptionParser usage text built in add_commands.
class HelpCommand < Command
  def enact(client)
    logger.puts commands.opts
  end
end
1035
+
1036
# --args: split a comma separated list and append every piece to the
# argument list of the command this option attaches to.
class ArgsOption < CommandOption
  def attach(commands)
    command = super(commands)
    command.args = command.args + @arg.split(",")
    command
  end
end
1043
+
1044
# --arg: append a single argument to the command this option attaches to.
class ArgOption < CommandOption
  def attach(commands)
    command = super(commands)
    command.args.push(@arg)
    command
  end
end
1051
+
1052
# Common behaviour for instance-group commands. The positional argument
# is either an instance group id (prefix "ig-") or a role name, which is
# upcased to MASTER/CORE/TASK.
class AbstractInstanceGroupCommand < Command
  attr_accessor :instance_group_id, :instance_type, :instance_role,
                :instance_count, :instance_group_name

  def initialize(*args)
    super(*args)
    if @arg.match(/^ig-/) then
      @instance_group_id = @arg
    else
      @instance_role = @arg.upcase
    end
  end

  # e.g. "Task Instance Group" for role TASK.
  def default_instance_group_name
    role = get_field(:instance_role)
    "#{role.downcase.capitalize} Instance Group"
  end

  # Hash describing this instance group for the API request.
  def instance_group
    {
      "Name" => get_field(:instance_group_name),
      "Market" => get_field(:instance_group_market, "ON_DEMAND"),
      "InstanceRole" => get_field(:instance_role),
      "InstanceCount" => get_field(:instance_count),
      "InstanceType" => get_field(:instance_type)
    }
  end

  # Raise unless +arr+ contains exactly one element.
  def require_singleton_array(arr, msg)
    unless arr.size == 1 then
      raise RuntimeError, "Expected to find one " + msg + " but found #{arr.size}."
    end
  end

end
1086
+
1087
# --add-instance-group: add a TASK instance group to an existing jobflow.
class AddInstanceGroupCommand < AbstractInstanceGroupCommand
  # Only the TASK role may be added after creation, and both sizing
  # options are mandatory.
  def validate
    unless %w(TASK).include?(get_field(:instance_role)) then
      raise RuntimeError, "Invalid argument to #{name}, expected 'task'"
    end
    require(:instance_type, "Option #{name} is missing --instance-type")
    require(:instance_count, "Option #{name} is missing --instance-count")
  end

  def enact(client)
    client.add_instance_groups(
      'JobFlowId' => require_single_jobflow, 'InstanceGroups' => [instance_group]
    )
    logger.puts("Added instance group " + get_field(:instance_role))
  end
end
1103
+
1104
# --instance-group (at --create time): any of the three roles is valid,
# and both sizing options are mandatory. Enactment happens via the
# enclosing create command; this class only validates.
class CreateInstanceGroupCommand < AbstractInstanceGroupCommand
  def validate
    unless %w(MASTER CORE TASK).include?(get_field(:instance_role)) then
      raise RuntimeError, "Invalid argument to #{name}, expected master, core or task"
    end
    require(:instance_type, "Option #{name} is missing --instance-type")
    require(:instance_count, "Option #{name} is missing --instance-count")
  end
end
1113
+
1114
# --modify-instance-group: change the instance count of an instance
# group, addressed either directly by id (ig-...) or indirectly by role
# (CORE/TASK) within a single supplied jobflow.
class ModifyInstanceGroupCommand < AbstractInstanceGroupCommand
  attr_accessor :jobflow_detail, :jobflow_id

  def validate
    # When no explicit group id was given we must be able to resolve the
    # group by role, which also requires a jobflow to look inside.
    if get_field(:instance_group_id) == nil then
      if ! ["CORE", "TASK"].include?(get_field(:instance_role)) then
        raise RuntimeError, "Invalid argument to #{name}, #{@arg} is not valid"
      end
      if get_field(:jobflow, []).size == 0 then
        raise RuntimeError, "You must specify a jobflow when using #{name} and specifying a role #{instance_role}"
      end
    end
    require(:instance_count, "Option #{name} is missing --instance-count")
  end

  def enact(client)
    # Resolve role -> instance group id by describing the jobflow and
    # expecting exactly one group with the requested role.
    if get_field(:instance_group_id) == nil then
      self.jobflow_id = require_single_jobflow
      self.jobflow_detail = client.describe_jobflow_with_id(self.jobflow_id)
      matching_instance_groups =
        jobflow_detail['Instances']['InstanceGroups'].select { |x| x['InstanceRole'] == instance_role }
      require_singleton_array(matching_instance_groups, "instance group with role #{instance_role}")
      self.instance_group_id = matching_instance_groups.first['InstanceGroupId']
    end
    options = {
      'InstanceGroups' => [{
        'InstanceGroupId' => get_field(:instance_group_id),
        'InstanceCount' => get_field(:instance_count)
      }]
    }
    client.modify_instance_groups(options)
    # Report whichever identifier the user supplied (role or group id).
    ig_modified = nil
    if get_field(:instance_role) != nil then
      ig_modified = get_field(:instance_role)
    else
      ig_modified = get_field(:instance_group_id)
    end
    logger.puts("Modified instance group " + ig_modified)
  end
end
1154
+
1155
# --unarrest-instance-group: re-request an arrested instance group's
# originally requested instance count, which un-arrests it. The group
# may be addressed by id (ig-...) or by role within a single jobflow.
class UnarrestInstanceGroupCommand < AbstractInstanceGroupCommand

  attr_accessor :jobflow_id, :jobflow_detail

  def validate
    require_single_jobflow
    if get_field(:instance_group_id) == nil then
      if ! ["CORE", "TASK"].include?(get_field(:instance_role)) then
        raise RuntimeError, "Invalid argument to #{name}, #{@arg} is not valid"
      end
    end
  end

  def enact(client)
    self.jobflow_id = require_single_jobflow
    self.jobflow_detail = client.describe_jobflow_with_id(self.jobflow_id)

    # Locate the target group either by role or by explicit group id.
    matching_instance_groups = nil
    if get_field(:instance_group_id) == nil then
      matching_instance_groups =
        jobflow_detail['Instances']['InstanceGroups'].select { |x| x['InstanceRole'] == instance_role }
    else
      matching_instance_groups =
        jobflow_detail['Instances']['InstanceGroups'].select { |x| x['InstanceGroupId'] == get_field(:instance_group_id) }
    end

    # NOTE(review): this message mentions the role even when the lookup
    # was done by group id (instance_role is nil then) -- verify intent.
    require_singleton_array(matching_instance_groups, "instance group with role #{instance_role}")
    instance_group_detail = matching_instance_groups.first
    self.instance_group_id = instance_group_detail['InstanceGroupId']
    # Re-submitting the requested count is what clears the ARRESTED state.
    self.instance_count = instance_group_detail['InstanceRequestCount']

    options = {
      'InstanceGroups' => [{
        'InstanceGroupId' => get_field(:instance_group_id),
        'InstanceCount' => get_field(:instance_count)
      }]
    }
    client.modify_instance_groups(options)
    logger.puts "Unarrested instance group #{get_field(:instance_group_id)}."
  end
end
1196
+
1197
# --instance-count: set the (integer) instance count on the command
# this option attaches to.
class InstanceCountOption < CommandOption
  def attach(commands)
    command = super(commands)
    command.instance_count = @arg.to_i
    command
  end
end
1204
+
1205
# --instance-type: set the EC2 instance type on the command this option
# attaches to.
class InstanceTypeOption < CommandOption
  def attach(commands)
    command = super(commands)
    command.instance_type = @arg
    command
  end
end
1212
+
1213
# An option that carries an argument, optionally validated against a
# regexp, and recorded on the current command under @field_symbol.
class OptionWithArg < CommandOption
  def attach(commands)
    command = super(commands)
    if @pattern && @arg.match(@pattern).nil? then
      raise RuntimeError, "Expected argument to #{@name} to match #{@pattern.inspect}, but it didn't"
    end
    command.option(@name, @field_symbol, @arg)
    command
  end
end
1223
+
1224
# A boolean option taking no argument: records +true+ under the supplied
# field symbol on the command it attaches to.
class FlagOption < CommandOption

  def initialize(name, description, arg, parent_commands, commands, field_symbol)
    super(name, description, arg, parent_commands, commands)
    @field_symbol = field_symbol
  end

  def attach(commands)
    command = super(commands)
    command.option(@name, @field_symbol, true)
    # CONSISTENCY FIX: every other *Option#attach returns the command it
    # attached to; previously this returned option()'s result instead.
    return command
  end
end
1236
+
1237
# --json FILE: read a JSON file describing one or more steps, apply
# --param VARIABLE=VALUE substitutions, and return the parsed step(s)
# always as an array.
class JsonStepCommand < StepCommand
  attr_accessor :variables

  def initialize(*args)
    super(*args)
    @variables = []
  end

  def steps
    content = steps = nil
    filename = get_field(:arg)
    begin
      content = File.read(filename)
    # BUGFIX: narrowed from Exception (which swallowed SignalException/
    # SystemExit) and repaired the broken message interpolation.
    rescue StandardError => e
      raise RuntimeError, "Couldn't read json file #{filename}"
    end
    # textual substitution of --param variables before parsing
    for var in get_field(:variables, []) do
      content.gsub!(var[:key], var[:value])
    end
    begin
      steps = JSON.parse(content)
    rescue StandardError => e
      raise RuntimeError, "Error parsing json from file #{filename}"
    end
    if steps.is_a?(Array) then
      return steps
    else
      return [ steps ]
    end
  end
end
1268
+
1269
# --param VARIABLE=VALUE: record a textual substitution to apply to
# --json step files.
class ParamOption < CommandOption
  def initialize(*args)
    super(*args)
    @params = []
  end

  def attach(commands)
    command = super(commands)
    match = @arg.match(/([^=]+)=(.*)/)
    if match.nil? then
      raise RuntimeError, "Expected '#{@arg}' to be in the form VARIABLE=VALUE"
    end
    command.option(@name, @field_symbol, { :key => match[1], :value => match[2] })
    command
  end
end
1285
+
1286
# --eip [ElasticIP]: wait for the jobflow's master node to be available,
# then associate an Elastic IP with it via the EC2 API (allocating a new
# address first when none was supplied).
class EipCommand < Command
  attr_accessor :no_wait, :instance_id, :key_pair_file, :jobflow_id, :jobflow_detail

  CLOSED_DOWN_STATES = Set.new(%w(TERMINATED SHUTTING_DOWN COMPLETED FAILED))
  WAITING_OR_RUNNING_STATES = Set.new(%w(WAITING RUNNING))

  # Regions that have a dedicated EC2 endpoint.
  EC2_REGIONS = %w(us-east-1 us-west-1 eu-west-1 ap-southeast-1 ap-northeast-1)

  def initialize(*args)
    super(*args)
  end

  def exec(cmd)
    commands.exec(cmd)
  end

  # Poll DescribeJobFlows until the jobflow is WAITING or RUNNING;
  # raise if it reaches a terminal state first.
  def wait_for_jobflow(client)
    while true do
      state = resolve(self.jobflow_detail, "ExecutionStatusDetail", "State")
      if WAITING_OR_RUNNING_STATES.include?(state) then
        break
      elsif CLOSED_DOWN_STATES.include?(state) then
        raise RuntimeError, "Jobflow entered #{state} while waiting to assign Elastic IP"
      else
        logger.info("Jobflow is in state #{state}, waiting....")
        sleep(30)
        self.jobflow_detail = client.describe_jobflow_with_id(jobflow_id)
      end
    end
  end

  def enact(client)
    self.jobflow_id = require_single_jobflow
    self.jobflow_detail = client.describe_jobflow_with_id(self.jobflow_id)
    if ! get_field(:no_wait) then
      wait_for_jobflow(client)
    end
    self.instance_id = self.jobflow_detail['Instances']['MasterInstanceId']
    if ! self.instance_id then
      logger.error("The master instance is not available yet for jobflow #{self.jobflow_id}. It might still be starting.")
      exit(-1)
    end

    # Pick the EC2 endpoint matching the jobflow's availability zone.
    # BUGFIX: the old code truncated the AZ to 9 characters ("us-east-1"
    # length) before comparing, so the 14-character regions
    # ap-southeast-1 and ap-northeast-1 could never match.
    az = self.jobflow_detail['Instances']['Placement']['AvailabilityZone']
    region = EC2_REGIONS.find { |r| az.start_with?(r) }
    ec2_endpoint = region ? "https://ec2.#{region}.amazonaws.com" : "https://ec2.amazonaws.com"
    commands.global_options[:ec2_endpoint] = ec2_endpoint

    self.key_pair_file = require(:key_pair_file, "Missing required option --key-pair-file for #{name}")
    eip = get_field(:arg)

    ec2_client = Ec2ClientWrapper.new(commands, logger)

    if ! eip then
      begin
        response = ec2_client.allocate_address()
      # NOTE(review): deliberately broad rescue around the network call;
      # kept as-is since the handler exits immediately.
      rescue Exception => e
        # BUGFIX: message typo ("AllocateAddres")
        logger.error("Error during AllocateAddress: " + e.message)
        if get_field(:trace) then
          logger.puts(e.backtrace.join("\n"))
        end
        exit(-1)
      end

      eip = response['publicIp']
      logger.info("Allocated Public IP: #{eip}...")
    end

    begin
      response = ec2_client.associate_address(self.instance_id, eip)
      logger.info("Public IP: #{eip} was assigned to jobflow #{self.jobflow_id}")
    rescue Exception => e
      # BUGFIX: message typo ("AssociateAddres")
      logger.error("Error during AssociateAddress: " + e.to_s)
      if get_field(:trace) then
        logger.puts(e.backtrace.join("\n"))
      end
      exit(-1)
    end

  end
end
1376
+
1377
# Register every command and option with the Commands registry and the
# OptionParser used for --help output. Fixes three help-text defects:
# the --args description said "command separated" instead of "comma
# separated", and the --state and --endpoint descriptions were
# copy-paste errors from unrelated options.
def self.add_commands(commands, opts)
  # FIXME: add --wait-for-step function

  commands.opts = opts

  # commands that accept step-level options such as --arg/--args
  step_commands = ["--jar", "--resize-jobflow", "--enable-debugging", "--hive-interactive", "--pig-interactive", "--hive-script", "--pig-script"]

  opts.separator "\n Creating Job Flows\n"

  commands.parse_command(CreateJobFlowCommand, "--create", "Create a new job flow")
  commands.parse_options(["--create"], [
    [ OptionWithArg, "--name NAME", "The name of the job flow being created", :jobflow_name ],
    [ FlagOption, "--alive", "Create a job flow that stays running even though it has executed all its steps", :alive ],
    [ OptionWithArg, "--with-termination-protection", "Create a job with termination protection (default is no termination protection)", :with_termination_protection ],
    [ OptionWithArg, "--num-instances NUM", "Number of instances in the job flow", :instance_count ],
    [ OptionWithArg, "--slave-instance-type TYPE", "The type of the slave instances to launch", :slave_instance_type ],
    [ OptionWithArg, "--master-instance-type TYPE", "The type of the master instance to launch", :master_instance_type ],
    [ OptionWithArg, "--key-pair KEY_PAIR", "The name of your Amazon EC2 Keypair", :key_pair ],
    [ OptionWithArg, "--availability-zone A_Z", "Specify the Availability Zone in which to launch the job flow", :az ],
    [ OptionWithArg, "--info INFO", "Specify additional info to job flow creation", :ainfo ],
    [ OptionWithArg, "--hadoop-version INFO", "Specify the Hadoop Version to install", :hadoop_version ],
    [ FlagOption, "--plain-output", "Return the job flow id from create step as simple text", :plain_output ],
  ])
  commands.parse_command(CreateInstanceGroupCommand, "--instance-group ROLE", "Specify an instance group while creating a jobflow")

  opts.separator "\n Passing arguments to steps\n"

  commands.parse_options(step_commands + ["--bootstrap-action", "--stream"], [
    # BUGFIX: help text said "command separated"
    [ ArgsOption, "--args ARGS", "A comma separated list of arguments to pass to the step" ],
    [ ArgOption, "--arg ARG", "An argument to pass to the step" ],
    [ OptionWithArg, "--step-name STEP_NAME", "Set name for the step", :step_name ],
    [ OptionWithArg, "--step-action STEP_ACTION", "Action to take when step finishes. One of CANCEL_AND_WAIT, TERMINATE_JOB_FLOW or CONTINUE", :step_action ],
  ])

  opts.separator "\n Specific Steps\n"

  commands.parse_command(ResizeJobflowCommand, "--resize-jobflow", "Add a step to resize the job flow")
  commands.parse_command(EnableDebuggingCommand, "--enable-debugging", "Enable job flow debugging (you must be signed up to SimpleDB for this to work)")

  opts.separator "\n Adding Steps from a Json File to Job Flows\n"

  commands.parse_command(JsonStepCommand, "--json FILE", "Add a sequence of steps stored in the json file FILE")
  commands.parse_options(["--json"], [
    [ ParamOption, "--param VARIABLE=VALUE ARGS", "Substitute the string VARIABLE with the string VALUE in the json file", :variables ],
  ])

  opts.separator "\n Pig Steps\n"

  commands.parse_command(PigScriptCommand, "--pig-script [SCRIPT]", "Add a step that runs a Pig script")
  commands.parse_command(PigInteractiveCommand, "--pig-interactive", "Add a step that sets up the job flow for an interactive (via SSH) pig session")

  opts.separator "\n Hive Steps\n"

  commands.parse_command(HiveScriptCommand, "--hive-script [SCRIPT]", "Add a step that runs a Hive script")
  commands.parse_command(HiveInteractiveCommand, "--hive-interactive", "Add a step that sets up the job flow for an interactive (via SSH) hive session")
  commands.parse_command(HiveSiteCommand, "--hive-site HIVE_SITE", "Override Hive configuration with configuration from HIVE_SITE")
  commands.parse_options(["--hive-script", "--hive-interactive", "--hive-site"], [
    [ OptionWithArg, "--hive-versions VERSIONS", "A comma separated list of Hive version", :hive_versions],
  ])

  opts.separator "\n Adding Jar Steps to Job Flows\n"

  commands.parse_command(JarStepCommand, "--jar JAR", "Run a Hadoop Jar in a step")
  commands.parse_options(["--jar"], [
    [ OptionWithArg, "--main-class MAIN_CLASS", "The main class of the jar", :main_class ]
  ])

  opts.separator "\n Adding Streaming Steps to Job Flows\n"

  commands.parse_command(StreamStepCommand, "--stream", "Add a step that performs hadoop streaming")
  commands.parse_options(["--stream"], [
    [ OptionWithArg, "--input INPUT", "Input to the steps, e.g. s3n://mybucket/input", :input],
    [ OptionWithArg, "--output OUTPUT", "The output to the steps, e.g. s3n://mybucket/output", :output],
    [ OptionWithArg, "--mapper MAPPER", "The mapper program or class", :mapper],
    [ OptionWithArg, "--cache CACHE_FILE", "A file to load into the cache, e.g. s3n://mybucket/sample.py#sample.py", :cache ],
    [ OptionWithArg, "--cache-archive CACHE_FILE", "A file to unpack into the cache, e.g. s3n://mybucket/sample.jar", :cache_archive, ],
    [ OptionWithArg, "--jobconf KEY=VALUE", "Specify jobconf arguments to pass to streaming, e.g. mapred.task.timeout=800000", :jobconf],
    [ OptionWithArg, "--reducer REDUCER", "The reducer program or class", :reducer],
  ])

  opts.separator "\n Adding and Modifying Instance Groups\n"

  commands.parse_command(ModifyInstanceGroupCommand, "--modify-instance-group INSTANCE_GROUP", "Modify an existing instance group")
  commands.parse_command(AddInstanceGroupCommand, "--add-instance-group ROLE", "Add an instance group to an existing jobflow")
  commands.parse_command(UnarrestInstanceGroupCommand, "--unarrest-instance-group ROLE", "Unarrest an instance group of the supplied jobflow")
  commands.parse_options(["--instance-group", "--modify-instance-group", "--add-instance-group", "--create"], [
    [ InstanceCountOption, "--instance-count INSTANCE_COUNT", "Set the instance count of an instance group", :instance_count ]
  ])
  commands.parse_options(["--instance-group", "--add-instance-group", "--create"], [
    [ InstanceTypeOption, "--instance-type INSTANCE_TYPE", "Set the instance type of an instance group", :instance_type ],
  ])

  opts.separator "\n Contacting the Master Node\n"

  # commands.parse_options(["--ssh", "--scp", "--eip"], [
  #   [ FlagOption, "--no-wait", "Don't wait for the Master node to start before executing scp or ssh or assigning EIP", :no_wait ],
  # ])

  commands.parse_command(SSHCommand, "--ssh [COMMAND]", "SSH to the master node and optionally run a command")
  commands.parse_command(PutCommand, "--put SRC", "Copy a file to the job flow using scp")
  commands.parse_command(GetCommand, "--get SRC", "Copy a file from the job flow using scp")
  commands.parse_command(PutCommand, "--scp SRC", "Copy a file to the job flow using scp")

  commands.parse_options(["--get", "--put", "--scp"], [
    [ OptionWithArg, "--to DEST", "Destination location when copying files", :dest ],
  ])

  commands.parse_command(LogsCommand, "--logs", "Display the step logs for the last executed step")

  opts.separator "\n Assigning Elastic IP to Master Node\n"

  commands.parse_command(EipCommand, "--eip [ElasticIP]", "Associate ElasticIP to master node. If no ElasticIP is specified, allocate and associate a new one.")

  opts.separator "\n Settings common to all step types\n"

  commands.parse_options(["--ssh", "--scp", "--eip"], [
    [ FlagOption, "--no-wait", "Don't wait for the Master node to start before executing scp or ssh or assigning EIP", :no_wait ],
    [ GlobalOption, "--key-pair-file FILE_PATH", "Path to your local pem file for your EC2 key pair", :key_pair_file ],
  ])

  opts.separator "\n Specifying Bootstrap Actions\n"

  commands.parse_command(BootstrapActionCommand, "--bootstrap-action SCRIPT", "Run a bootstrap action script on all instances")
  commands.parse_options(["--bootstrap-action"], [
    [ OptionWithArg, "--bootstrap-name NAME", "Set the name of the bootstrap action", :bootstrap_name ],
  ])


  opts.separator "\n Listing and Describing Job flows\n"

  commands.parse_command(ListActionCommand, "--list", "List all job flows created in the last 2 days")
  commands.parse_command(DescribeActionCommand, "--describe", "Dump a JSON description of the supplied job flows")
  commands.parse_command(PrintHiveVersionCommand, "--print-hive-version", "Prints the version of Hive that's currently active on the job flow")
  commands.parse_options(["--list", "--describe"], [
    # BUGFIX: description was copy-pasted from --bootstrap-name
    [ OptionWithArg, "--state NAME", "List job flows in the state NAME", :state ],
    [ FlagOption, "--active", "List running, starting or shutting down job flows", :active ],
    [ FlagOption, "--all", "List all job flows in the last 2 months", :all ],
    [ FlagOption, "--no-steps", "Do not list steps when listing jobs", :no_steps ],
  ])

  opts.separator "\n Terminating Job Flows\n"

  commands.parse_command(SetTerminationProtection, "--set-termination-protection BOOL", "Enable or disable job flow termination protection. Either true or false")

  commands.parse_command(TerminateActionCommand, "--terminate", "Terminate job flows")

  opts.separator "\n Common Options\n"

  commands.parse_options(["--jobflow", "--describe"], [
    [ GlobalOption, "--jobflow JOB_FLOW_ID", "The job flow to act on", :jobflow, /^j-[A-Z0-9]+$/],
  ])

  commands.parse_options(:global, [
    [ GlobalFlagOption, "--verbose", "Turn on verbose logging of program interaction", :verbose ],
    [ GlobalFlagOption, "--trace", "Trace commands made to the webservice", :trace ],
    [ GlobalOption, "--credentials CRED_FILE", "File containing access-id and private-key", :credentials],
    [ GlobalOption, "--access-id ACCESS_ID", "AWS Access Id", :aws_access_id],
    [ GlobalOption, "--private-key PRIVATE_KEY", "AWS Private Key", :aws_secret_key],
    [ GlobalOption, "--log-uri LOG_URI", "Location in S3 to store logs from the job flow, e.g. s3n://mybucket/logs", :log_uri ],
  ])
  commands.parse_command(VersionCommand, "--version", "Print version string")
  commands.parse_command(HelpCommand, "--help", "Show help message")

  opts.separator "\n Uncommon Options\n"

  commands.parse_options(:global, [
    [ GlobalFlagOption, "--debug", "Print stack traces when exceptions occur", :debug],
    # BUGFIX: description was copy-pasted from --credentials
    [ GlobalOption, "--endpoint ENDPOINT", "The Elastic MapReduce service endpoint to connect to", :endpoint],
    [ GlobalOption, "--region REGION", "The region to use for the endpoint", :region],
    [ GlobalOption, "--apps-path APPS_PATH", "Specify s3:// path to the base of the emr public bucket to use. e.g s3://us-east-1.elasticmapreduce", :apps_path],
    [ GlobalOption, "--beta-path BETA_PATH", "Specify s3:// path to the base of the emr public bucket to use for beta apps. e.g s3://beta.elasticmapreduce", :beta_path],
  ])

  opts.separator "\n Short Options\n"
  commands.parse_command(HelpCommand, "-h", "Show help message")
  commands.parse_options(:global, [
    [ GlobalFlagOption, "-v", "Turn on verbose logging of program interaction", :verbose ],
    [ GlobalOption, "-c CRED_FILE", "File containing access-id and private-key", :credentials ],
    [ GlobalOption, "-a ACCESS_ID", "AWS Access Id", :aws_access_id],
    [ GlobalOption, "-p PRIVATE_KEY", "AWS Private Key", :aws_secret_key],
    [ GlobalOption, "-j JOB_FLOW_ID", "The job flow to act on", :jobflow, /^j-[A-Z0-9]+$/],
  ])

end
1561
+
1562
# True when +cmd+ must be folded into a preceding --create (or into a
# synthesized --add-steps) command rather than executed standalone.
def self.is_create_child_command(cmd)
  [StepCommand, BootstrapActionCommand,
   AddInstanceGroupCommand, CreateInstanceGroupCommand].any? { |klass| cmd.is_a?(klass) }
end
1568
+
1569
# Rearranges the parsed command list so that step, bootstrap-action and
# instance-group commands are folded into the preceding --create command.
# Steps with no preceding --create are attached to a synthesized
# --add-steps command instead; bootstrap actions and create-time instance
# groups without a --create are errors.
def self.fold_commands(commands)
  last_create_command = nil
  new_commands = []
  for cmd in commands do
    if cmd.is_a?(CreateJobFlowCommand) then
      # remember the most recent --create; children fold into it
      last_create_command = cmd
    elsif is_create_child_command(cmd) then
      if last_create_command == nil then
        if cmd.is_a?(StepCommand) then
          # no --create seen yet: steps go to an existing jobflow
          last_create_command = AddJobFlowStepsCommand.new(
            "--add-steps", "Add job flow steps", nil, commands
          )
          new_commands << last_create_command
        elsif cmd.is_a?(BootstrapActionCommand) then
          raise RuntimeError, "the option #{cmd.name} must come after the --create option"
        elsif cmd.is_a?(CreateInstanceGroupCommand) then
          raise RuntimeError, "the option #{cmd.name} must come after the --create option"
        elsif cmd.is_a?(AddInstanceGroupCommand) then
          # --add-instance-group is valid standalone
          new_commands << cmd
          next
        else
          next
        end
      end

      if cmd.is_a?(StepCommand) then
        if ! last_create_command.respond_to?(:add_step_command) then
          # NOTE(review): this replacement command is not appended to
          # new_commands, unlike the branch above -- verify whether this
          # path is reachable (last_create_command appears to always be
          # a CreateJobFlowCommand or AddJobFlowStepsCommand here).
          last_create_command = AddJobFlowStepsCommand.new(
            "--add-steps", "Add job flow steps", nil, commands
          )
        end
        last_create_command.add_step_command(cmd)
      elsif cmd.is_a?(BootstrapActionCommand) then
        if ! last_create_command.respond_to?(:add_bootstrap_command) then
          raise RuntimeError, "Bootstrap actions must follow a --create command"
        end
        last_create_command.add_bootstrap_command(cmd)
      elsif cmd.is_a?(CreateInstanceGroupCommand) || cmd.is_a?(AddInstanceGroupCommand) then
        if last_create_command.respond_to?(:add_instance_group_command) then
          last_create_command.add_instance_group_command(cmd)
        else
          # e.g. --add-instance-group following --add-steps: keep standalone
          new_commands << cmd
        end
      else
        raise RuntimeError, "Unknown child command #{cmd.name} following #{last_create_command.name}"
      end
      next
    end
    new_commands << cmd
  end

  commands.commands = new_commands
end
1623
+
1624
# Top-level driver: parse +args+, resolve credentials and derived global
# options, fold child commands into their parents, validate, then run
# every command against a freshly built EMR client. Returns the Commands
# object. On RuntimeError the message is logged (with backtrace under
# --trace) and the process exits with -1 unless +exit_on_error+ is false,
# in which case the error is re-raised.
def self.create_and_execute_commands(args, client_class, logger, executor, exit_on_error=true)
  commands = Commands.new(logger, executor)

  begin
    opts = OptionParser.new do |opts|
      add_commands(commands, opts)
    end
    opts.parse!(args)

    if commands.get_field(:trace) then
      logger.level = :trace
    end

    # remaining bare arguments are treated as jobflow ids
    commands.parse_jobflows(args)

    # no commands given: behave as if --help was passed
    if commands.commands.size == 0 then
      commands.commands << HelpCommand.new("--help", "Print help text", nil, commands)
    end

    credentials = Credentials.new(commands)
    credentials.parse_credentials(commands.get_field(:credentials, "credentials.json"),
                                  commands.global_options)

    work_out_globals(commands)
    fold_commands(commands)
    commands.validate
    client = EmrClient.new(commands, logger, client_class)
    commands.enact(client)
  rescue RuntimeError => e
    logger.puts("Error: " + e.message)
    if commands.get_field(:trace) then
      logger.puts(e.backtrace.join("\n"))
    end
    if exit_on_error then
      exit(-1)
    else
      raise e
    end
  end
  return commands
end
1665
+
1666
# Derive dependent global options: --region implies an endpoint, the
# endpoint's region implies the default --apps-path, and both public
# bucket paths receive defaults plus trailing-slash normalization.
# Raises RuntimeError when --region and --endpoint are both supplied.
def self.work_out_globals(commands)
  options = commands.global_options
  if commands.have(:region) then
    if commands.have(:endpoint) then
      raise RuntimeError, "You may not specify --region together with --endpoint"
    end

    endpoint = "https://#{options[:region]}.elasticmapreduce.amazonaws.com"
    commands.global_options[:endpoint] = endpoint
  end

  if commands.have(:endpoint) then
    # BUGFIX: the pattern was built from a double-quoted string, so the
    # intended \. escape degraded to a bare '.' and 'https*' accepted any
    # number of trailing 's'. Use a proper regexp literal with 'https?'.
    region_match = commands.get_field(:endpoint).match(%r{^https?://(.*)\.elasticmapreduce})
    if ! commands.have(:apps_path) && region_match != nil then
      options[:apps_path] = "s3://#{region_match[1]}.elasticmapreduce"
    end
  end

  options[:apps_path] ||= "s3://us-east-1.elasticmapreduce"
  options[:beta_path] ||= "s3://beta.elasticmapreduce"
  for key in [:apps_path, :beta_path] do
    options[key].chomp!("/")
  end
end
1690
+ end