elasticity 1.5 → 2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (57) hide show
  1. data/.rspec +2 -1
  2. data/.rvmrc +1 -1
  3. data/HISTORY.md +47 -24
  4. data/LICENSE +1 -1
  5. data/README.md +165 -317
  6. data/Rakefile +4 -3
  7. data/elasticity.gemspec +3 -5
  8. data/lib/elasticity.rb +10 -5
  9. data/lib/elasticity/aws_request.rb +81 -20
  10. data/lib/elasticity/custom_jar_step.rb +33 -0
  11. data/lib/elasticity/emr.rb +45 -117
  12. data/lib/elasticity/hadoop_bootstrap_action.rb +27 -0
  13. data/lib/elasticity/hive_step.rb +57 -0
  14. data/lib/elasticity/job_flow.rb +109 -39
  15. data/lib/elasticity/job_flow_status.rb +53 -0
  16. data/lib/elasticity/job_flow_status_step.rb +35 -0
  17. data/lib/elasticity/job_flow_step.rb +17 -25
  18. data/lib/elasticity/pig_step.rb +82 -0
  19. data/lib/elasticity/support/conditional_raise.rb +23 -0
  20. data/lib/elasticity/version.rb +1 -1
  21. data/spec/lib/elasticity/aws_request_spec.rb +159 -51
  22. data/spec/lib/elasticity/custom_jar_step_spec.rb +59 -0
  23. data/spec/lib/elasticity/emr_spec.rb +231 -762
  24. data/spec/lib/elasticity/hadoop_bootstrap_action_spec.rb +26 -0
  25. data/spec/lib/elasticity/hive_step_spec.rb +74 -0
  26. data/spec/lib/elasticity/job_flow_integration_spec.rb +197 -0
  27. data/spec/lib/elasticity/job_flow_spec.rb +369 -138
  28. data/spec/lib/elasticity/job_flow_status_spec.rb +147 -0
  29. data/spec/lib/elasticity/job_flow_status_step_spec.rb +73 -0
  30. data/spec/lib/elasticity/job_flow_step_spec.rb +26 -64
  31. data/spec/lib/elasticity/pig_step_spec.rb +104 -0
  32. data/spec/lib/elasticity/support/conditional_raise_spec.rb +35 -0
  33. data/spec/spec_helper.rb +1 -50
  34. data/spec/support/be_a_hash_including_matcher.rb +35 -0
  35. metadata +101 -119
  36. data/.autotest +0 -2
  37. data/lib/elasticity/custom_jar_job.rb +0 -38
  38. data/lib/elasticity/hive_job.rb +0 -69
  39. data/lib/elasticity/pig_job.rb +0 -109
  40. data/lib/elasticity/simple_job.rb +0 -51
  41. data/spec/fixtures/vcr_cassettes/add_instance_groups/one_group_successful.yml +0 -44
  42. data/spec/fixtures/vcr_cassettes/add_instance_groups/one_group_unsuccessful.yml +0 -41
  43. data/spec/fixtures/vcr_cassettes/add_jobflow_steps/add_multiple_steps.yml +0 -266
  44. data/spec/fixtures/vcr_cassettes/custom_jar_job/cloudburst.yml +0 -41
  45. data/spec/fixtures/vcr_cassettes/describe_jobflows/all_jobflows.yml +0 -75
  46. data/spec/fixtures/vcr_cassettes/direct/terminate_jobflow.yml +0 -38
  47. data/spec/fixtures/vcr_cassettes/hive_job/hive_ads.yml +0 -41
  48. data/spec/fixtures/vcr_cassettes/modify_instance_groups/set_instances_to_3.yml +0 -38
  49. data/spec/fixtures/vcr_cassettes/pig_job/apache_log_reports.yml +0 -41
  50. data/spec/fixtures/vcr_cassettes/pig_job/apache_log_reports_with_bootstrap.yml +0 -41
  51. data/spec/fixtures/vcr_cassettes/run_jobflow/word_count.yml +0 -41
  52. data/spec/fixtures/vcr_cassettes/set_termination_protection/nonexistent_job_flows.yml +0 -41
  53. data/spec/fixtures/vcr_cassettes/set_termination_protection/protect_multiple_job_flows.yml +0 -38
  54. data/spec/fixtures/vcr_cassettes/terminate_jobflows/one_jobflow.yml +0 -38
  55. data/spec/lib/elasticity/custom_jar_job_spec.rb +0 -118
  56. data/spec/lib/elasticity/hive_job_spec.rb +0 -90
  57. data/spec/lib/elasticity/pig_job_spec.rb +0 -226
@@ -0,0 +1,27 @@
1
+ module Elasticity
2
+
3
+ class HadoopBootstrapAction
4
+
5
+ attr_accessor :name
6
+ attr_accessor :option
7
+ attr_accessor :value
8
+
9
+ def initialize(option, value)
10
+ @name = 'Elasticity Bootstrap Action (Configure Hadoop)'
11
+ @option = option
12
+ @value = value
13
+ end
14
+
15
+ def to_aws_bootstrap_action
16
+ {
17
+ :name => @name,
18
+ :script_bootstrap_action => {
19
+ :path => 's3n://elasticmapreduce/bootstrap-actions/configure-hadoop',
20
+ :args => [@option, @value]
21
+ }
22
+ }
23
+ end
24
+
25
+ end
26
+
27
+ end
@@ -0,0 +1,57 @@
1
+ module Elasticity
2
+
3
+ class HiveStep
4
+
5
+ include JobFlowStep
6
+
7
+ attr_accessor :name
8
+ attr_accessor :script
9
+ attr_accessor :variables
10
+ attr_accessor :action_on_failure
11
+
12
+ def initialize(script)
13
+ @name = "Elasticity Hive Step (#{script})"
14
+ @script = script
15
+ @variables = { }
16
+ @action_on_failure = 'TERMINATE_JOB_FLOW'
17
+ end
18
+
19
+ def to_aws_step(job_flow)
20
+ args = %w(s3://elasticmapreduce/libs/hive/hive-script --run-hive-script --args)
21
+ args.concat(['-f', @script])
22
+ @variables.keys.sort.each do |name|
23
+ args.concat(['-d', "#{name}=#{@variables[name]}"])
24
+ end
25
+ {
26
+ :name => @name,
27
+ :action_on_failure => @action_on_failure,
28
+ :hadoop_jar_step => {
29
+ :jar => 's3://elasticmapreduce/libs/script-runner/script-runner.jar',
30
+ :args => args
31
+ }
32
+ }
33
+ end
34
+
35
+ def self.requires_installation?
36
+ true
37
+ end
38
+
39
+ def self.aws_installation_step
40
+ {
41
+ :action_on_failure => 'TERMINATE_JOB_FLOW',
42
+ :hadoop_jar_step => {
43
+ :jar => 's3://elasticmapreduce/libs/script-runner/script-runner.jar',
44
+ :args => [
45
+ 's3://elasticmapreduce/libs/hive/hive-script',
46
+ '--base-path',
47
+ 's3://elasticmapreduce/libs/hive/',
48
+ '--install-hive'
49
+ ],
50
+ },
51
+ :name => 'Elasticity - Install Hive'
52
+ }
53
+ end
54
+
55
+ end
56
+
57
+ end
@@ -1,51 +1,121 @@
1
1
  module Elasticity
2
2
 
3
+ class JobFlowRunningError < StandardError; end
4
+ class JobFlowNotStartedError < StandardError; end
5
+ class JobFlowMissingStepsError < StandardError; end
6
+
3
7
  class JobFlow
4
8
 
9
+ attr_accessor :action_on_failure
10
+ attr_accessor :ec2_key_name
5
11
  attr_accessor :name
6
- attr_accessor :jobflow_id
7
- attr_accessor :state
8
- attr_accessor :steps
9
- attr_accessor :created_at
10
- attr_accessor :started_at
11
- attr_accessor :ready_at
12
+ attr_accessor :hadoop_version
12
13
  attr_accessor :instance_count
14
+ attr_accessor :log_uri
13
15
  attr_accessor :master_instance_type
14
16
  attr_accessor :slave_instance_type
15
- attr_accessor :last_state_change_reason
16
-
17
- def initialize
18
- @steps = []
19
- end
20
-
21
- # Create a jobflow from an AWS <member> (Nokogiri::XML::Element):
22
- # /DescribeJobFlowsResponse/DescribeJobFlowsResult/JobFlows/member
23
- def self.from_member_element(xml_element)
24
- jobflow = JobFlow.new
25
- jobflow.name = xml_element.xpath("./Name").text.strip
26
- jobflow.jobflow_id = xml_element.xpath("./JobFlowId").text.strip
27
- jobflow.state = xml_element.xpath("./ExecutionStatusDetail/State").text.strip
28
- jobflow.last_state_change_reason = xml_element.xpath("./ExecutionStatusDetail/LastStateChangeReason").text.strip
29
- jobflow.steps = JobFlowStep.from_members_nodeset(xml_element.xpath("./Steps/member"))
30
- jobflow.created_at = Time.parse(xml_element.xpath("./ExecutionStatusDetail/CreationDateTime").text.strip)
31
- started_at = xml_element.xpath("./ExecutionStatusDetail/StartDateTime").text.strip
32
- jobflow.started_at = (started_at == "") ? (nil) : (Time.parse(started_at))
33
- ready_at = xml_element.xpath("./ExecutionStatusDetail/ReadyDateTime").text.strip
34
- jobflow.ready_at = (ready_at == "") ? (nil) : (Time.parse(ready_at))
35
- jobflow.instance_count = xml_element.xpath("./Instances/InstanceCount").text.strip
36
- jobflow.master_instance_type = xml_element.xpath("./Instances/MasterInstanceType").text.strip
37
- jobflow.slave_instance_type = xml_element.xpath("./Instances/SlaveInstanceType").text.strip
38
- jobflow
39
- end
40
-
41
- # Create JobFlows from a collection of AWS <member> nodes (Nokogiri::XML::NodeSet):
42
- # /DescribeJobFlowsResponse/DescribeJobFlowsResult/JobFlows
43
- def self.from_members_nodeset(members_nodeset)
44
- jobflows = []
45
- members_nodeset.each do |member|
46
- jobflows << from_member_element(member)
17
+ attr_accessor :ami_version
18
+ attr_accessor :keep_job_flow_alive_when_no_steps
19
+ attr_accessor :ec2_subnet_id
20
+
21
+ def initialize(access, secret)
22
+ @action_on_failure = 'TERMINATE_JOB_FLOW'
23
+ @ec2_key_name = 'default'
24
+ @hadoop_version = '0.20.205'
25
+ @instance_count = 2
26
+ @master_instance_type = 'm1.small'
27
+ @name = 'Elasticity Job Flow'
28
+ @slave_instance_type = 'm1.small'
29
+ @ami_version = 'latest'
30
+ @keep_job_flow_alive_when_no_steps = false
31
+
32
+ @emr = Elasticity::EMR.new(access, secret)
33
+
34
+ @bootstrap_actions = []
35
+ @jobflow_steps = []
36
+ @installed_steps = []
37
+ end
38
+
39
+ def instance_count=(count)
40
+ raise ArgumentError, 'Instance count cannot be set to less than 2 (requested 1)' unless count > 1
41
+ @instance_count = count
42
+ end
43
+
44
+ def add_bootstrap_action(bootstrap_action)
45
+ raise_if is_jobflow_running?, JobFlowRunningError, 'To modify bootstrap actions, please create a new job flow.'
46
+ @bootstrap_actions << bootstrap_action
47
+ end
48
+
49
+ def add_step(jobflow_step)
50
+ if is_jobflow_running?
51
+ jobflow_steps = []
52
+ if jobflow_step.class.send(:requires_installation?) && !@installed_steps.include?(jobflow_step.class)
53
+ jobflow_steps << jobflow_step.class.send(:aws_installation_step)
54
+ end
55
+ jobflow_steps << jobflow_step.to_aws_step(self)
56
+ @emr.add_jobflow_steps(@jobflow_id, {:steps => jobflow_steps})
57
+ else
58
+ @jobflow_steps << jobflow_step
59
+ end
60
+ end
61
+
62
+ def run
63
+ raise_if @jobflow_steps.empty?, JobFlowMissingStepsError, 'Cannot run a job flow without adding steps. Please use #add_step.'
64
+ raise_if is_jobflow_running?, JobFlowRunningError, 'Cannot run a job flow multiple times. To do more with this job flow, please use #add_step.'
65
+ @jobflow_id ||= @emr.run_job_flow(jobflow_config)
66
+ end
67
+
68
+ def shutdown
69
+ raise_unless is_jobflow_running?, JobFlowNotStartedError, 'Cannot #shutdown a job flow that has not yet been #run.'
70
+ @emr.terminate_jobflows(@jobflow_id)
71
+ end
72
+
73
+ def status
74
+ raise_unless is_jobflow_running?, JobFlowNotStartedError, 'Please #run this job flow before attempting to retrieve status.'
75
+ @emr.describe_jobflow(@jobflow_id)
76
+ end
77
+
78
+ private
79
+
80
+ def is_jobflow_running?
81
+ @jobflow_id
82
+ end
83
+
84
+ def jobflow_config
85
+ config = jobflow_preamble
86
+ config[:steps] = jobflow_steps
87
+ config[:log_uri] = @log_uri if @log_uri
88
+ config[:bootstrap_actions] = @bootstrap_actions.map{|a| a.to_aws_bootstrap_action} unless @bootstrap_actions.empty?
89
+ config
90
+ end
91
+
92
+ def jobflow_preamble
93
+ {
94
+ :name => @name,
95
+ :ami_version => @ami_version,
96
+ :instances => {
97
+ :keep_job_flow_alive_when_no_steps => @keep_job_flow_alive_when_no_steps,
98
+ :ec2_key_name => @ec2_key_name,
99
+ :hadoop_version => @hadoop_version,
100
+ :instance_count => @instance_count,
101
+ :master_instance_type => @master_instance_type,
102
+ :slave_instance_type => @slave_instance_type,
103
+ }
104
+ }.tap do |preamble|
105
+ preamble.merge!(:ec2_subnet_id => @ec2_subnet_id) if @ec2_subnet_id
106
+ end
107
+ end
108
+
109
+ def jobflow_steps
110
+ steps = []
111
+ @jobflow_steps.each do |step|
112
+ if step.class.send(:requires_installation?) && !@installed_steps.include?(step.class)
113
+ steps << step.class.send(:aws_installation_step)
114
+ @installed_steps << step.class
115
+ end
116
+ steps << step.to_aws_step(self)
47
117
  end
48
- jobflows
118
+ steps
49
119
  end
50
120
 
51
121
  end
@@ -0,0 +1,53 @@
1
+ module Elasticity
2
+
3
+ class JobFlowStatus
4
+
5
+ attr_accessor :name
6
+ attr_accessor :jobflow_id
7
+ attr_accessor :state
8
+ attr_accessor :steps
9
+ attr_accessor :created_at
10
+ attr_accessor :started_at
11
+ attr_accessor :ready_at
12
+ attr_accessor :instance_count
13
+ attr_accessor :master_instance_type
14
+ attr_accessor :slave_instance_type
15
+ attr_accessor :last_state_change_reason
16
+
17
+ def initialize
18
+ @steps = []
19
+ end
20
+
21
+ # Create a JobFlowStatus from an AWS <member> (Nokogiri::XML::Element):
22
+ # /DescribeJobFlowsResponse/DescribeJobFlowsResult/JobFlows/member
23
+ def self.from_member_element(xml_element)
24
+ jobflow = JobFlowStatus.new
25
+ jobflow.name = xml_element.xpath('./Name').text.strip
26
+ jobflow.jobflow_id = xml_element.xpath('./JobFlowId').text.strip
27
+ jobflow.state = xml_element.xpath('./ExecutionStatusDetail/State').text.strip
28
+ jobflow.last_state_change_reason = xml_element.xpath('./ExecutionStatusDetail/LastStateChangeReason').text.strip
29
+ jobflow.steps = JobFlowStatusStep.from_members_nodeset(xml_element.xpath('./Steps/member'))
30
+ jobflow.created_at = Time.parse(xml_element.xpath('./ExecutionStatusDetail/CreationDateTime').text.strip)
31
+ started_at = xml_element.xpath('./ExecutionStatusDetail/StartDateTime').text.strip
32
+ jobflow.started_at = (started_at == '') ? (nil) : (Time.parse(started_at))
33
+ ready_at = xml_element.xpath('./ExecutionStatusDetail/ReadyDateTime').text.strip
34
+ jobflow.ready_at = (ready_at == '') ? (nil) : (Time.parse(ready_at))
35
+ jobflow.instance_count = xml_element.xpath('./Instances/InstanceCount').text.strip
36
+ jobflow.master_instance_type = xml_element.xpath('./Instances/MasterInstanceType').text.strip
37
+ jobflow.slave_instance_type = xml_element.xpath('./Instances/SlaveInstanceType').text.strip
38
+ jobflow
39
+ end
40
+
41
+ # Create JobFlowStatuses from a collection of AWS <member> nodes (Nokogiri::XML::NodeSet):
42
+ # /DescribeJobFlowsResponse/DescribeJobFlowsResult/JobFlows
43
+ def self.from_members_nodeset(members_nodeset)
44
+ jobflows = []
45
+ members_nodeset.each do |member|
46
+ jobflows << from_member_element(member)
47
+ end
48
+ jobflows
49
+ end
50
+
51
+ end
52
+
53
+ end
@@ -0,0 +1,35 @@
1
+ module Elasticity
2
+
3
+ class JobFlowStatusStep
4
+
5
+ attr_accessor :name
6
+ attr_accessor :state
7
+ attr_accessor :started_at
8
+ attr_accessor :ended_at
9
+
10
+ # Create a JobFlowStatusStep from an AWS <member> (Nokogiri::XML::Element):
11
+ # /DescribeJobFlowsResponse/DescribeJobFlowsResult/JobFlows/member/Steps/member
12
+ def self.from_member_element(xml_element)
13
+ job_flow_step = JobFlowStatusStep.new
14
+ job_flow_step.name = xml_element.xpath("./StepConfig/Name").text.strip
15
+ job_flow_step.state = xml_element.xpath("./ExecutionStatusDetail/State").text.strip
16
+ started_at = xml_element.xpath("./ExecutionStatusDetail/StartDateTime").text.strip
17
+ job_flow_step.started_at = (started_at == "") ? (nil) : (Time.parse(started_at))
18
+ ended_at = xml_element.xpath("./ExecutionStatusDetail/EndDateTime").text.strip
19
+ job_flow_step.ended_at = (ended_at == "") ? (nil) : (Time.parse(ended_at))
20
+ job_flow_step
21
+ end
22
+
23
+ # Create JobFlowStatusSteps from a collection of AWS <member> nodes (Nokogiri::XML::NodeSet):
24
+ # /DescribeJobFlowsResponse/DescribeJobFlowsResult/JobFlows/member/Steps/member
25
+ def self.from_members_nodeset(members_nodeset)
26
+ jobflow_steps = []
27
+ members_nodeset.each do |member|
28
+ jobflow_steps << from_member_element(member)
29
+ end
30
+ jobflow_steps
31
+ end
32
+
33
+ end
34
+
35
+ end
@@ -1,33 +1,25 @@
1
1
  module Elasticity
2
2
 
3
- class JobFlowStep
4
-
5
- attr_accessor :name
6
- attr_accessor :state
7
- attr_accessor :started_at
8
- attr_accessor :ended_at
9
-
10
- # Create a job flow from an AWS <member> (Nokogiri::XML::Element):
11
- # /DescribeJobFlowsResponse/DescribeJobFlowsResult/JobFlows/member/Steps/member
12
- def self.from_member_element(xml_element)
13
- job_flow_step = JobFlowStep.new
14
- job_flow_step.name = xml_element.xpath("./StepConfig/Name").text.strip
15
- job_flow_step.state = xml_element.xpath("./ExecutionStatusDetail/State").text.strip
16
- started_at = xml_element.xpath("./ExecutionStatusDetail/StartDateTime").text.strip
17
- job_flow_step.started_at = (started_at == "") ? (nil) : (Time.parse(started_at))
18
- ended_at = xml_element.xpath("./ExecutionStatusDetail/EndDateTime").text.strip
19
- job_flow_step.ended_at = (ended_at == "") ? (nil) : (Time.parse(ended_at))
20
- job_flow_step
3
+ module JobFlowStep
4
+
5
+ def to_aws_step(jobflow_step)
6
+ raise RuntimeError, '#to_aws_step is required to be defined on all job flow steps.'
21
7
  end
22
8
 
23
- # Create JobFlowSteps from a collection of AWS <member> nodes (Nokogiri::XML::NodeSet):
24
- # /DescribeJobFlowsResponse/DescribeJobFlowsResult/JobFlows/member/Steps/member
25
- def self.from_members_nodeset(members_nodeset)
26
- jobflow_steps = []
27
- members_nodeset.each do |member|
28
- jobflow_steps << from_member_element(member)
9
+ module ClassMethods
10
+
11
+ def requires_installation?
12
+ false
13
+ end
14
+
15
+ def aws_installation_step
16
+ raise RuntimeError, '.aws_installation_step is required to be defined when a step requires installation (e.g. Pig, Hive).'
29
17
  end
30
- jobflow_steps
18
+
19
+ end
20
+
21
+ def self.included(base)
22
+ base.extend(ClassMethods)
31
23
  end
32
24
 
33
25
  end
@@ -0,0 +1,82 @@
1
+ module Elasticity
2
+
3
+ class PigStep
4
+
5
+ include JobFlowStep
6
+
7
+ attr_accessor :name
8
+ attr_accessor :script
9
+ attr_accessor :variables
10
+ attr_accessor :action_on_failure
11
+
12
+ def initialize(script)
13
+ @name = "Elasticity Pig Step (#{script})"
14
+ @script = script
15
+ @variables = { }
16
+ @action_on_failure = 'TERMINATE_JOB_FLOW'
17
+ end
18
+
19
+ def to_aws_step(job_flow)
20
+ args = %w(s3://elasticmapreduce/libs/pig/pig-script --run-pig-script --args)
21
+ @variables.keys.sort.each do |name|
22
+ args.concat(['-p', "#{name}=#{@variables[name]}"])
23
+ end
24
+ args.concat(['-p', "E_PARALLELS=#{parallels(job_flow.slave_instance_type, job_flow.instance_count)}"])
25
+ args << @script
26
+ {
27
+ :action_on_failure => @action_on_failure,
28
+ :hadoop_jar_step => {
29
+ :jar => 's3://elasticmapreduce/libs/script-runner/script-runner.jar',
30
+ :args => args,
31
+ },
32
+ :name => @name
33
+ }
34
+ end
35
+
36
+ def self.requires_installation?
37
+ true
38
+ end
39
+
40
+ def self.aws_installation_step
41
+ {
42
+ :action_on_failure => 'TERMINATE_JOB_FLOW',
43
+ :hadoop_jar_step => {
44
+ :jar => 's3://elasticmapreduce/libs/script-runner/script-runner.jar',
45
+ :args => [
46
+ 's3://elasticmapreduce/libs/pig/pig-script',
47
+ '--base-path',
48
+ 's3://elasticmapreduce/libs/pig/',
49
+ '--install-pig'
50
+ ],
51
+ },
52
+ :name => 'Elasticity - Install Pig'
53
+ }
54
+ end
55
+
56
+ private
57
+
58
+ # Calculate a common-sense default value of PARALLELS using the following
59
+ # formula from the Pig Cookbook:
60
+ #
61
+ # <num machines> * <num reduce slots per machine> * 0.9
62
+ #
63
+ # With the following reducer configuration (from an AWS forum post):
64
+ #
65
+ # m1.small 1
66
+ # m1.large 2
67
+ # m1.xlarge 4
68
+ # c1.medium 2
69
+ # c1.xlarge 4
70
+ def parallels(slave_instance_type, instance_count)
71
+ reduce_slots = Hash.new(1)
72
+ reduce_slots['m1.small'] = 1
73
+ reduce_slots['m1.large'] = 2
74
+ reduce_slots['m1.xlarge'] = 4
75
+ reduce_slots['c1.medium'] = 2
76
+ reduce_slots['c1.xlarge'] = 4
77
+ ((instance_count - 1).to_f * reduce_slots[slave_instance_type].to_f * 0.9).ceil
78
+ end
79
+
80
+ end
81
+
82
+ end