elasticity 1.5 → 2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57):
  1. data/.rspec +2 -1
  2. data/.rvmrc +1 -1
  3. data/HISTORY.md +47 -24
  4. data/LICENSE +1 -1
  5. data/README.md +165 -317
  6. data/Rakefile +4 -3
  7. data/elasticity.gemspec +3 -5
  8. data/lib/elasticity.rb +10 -5
  9. data/lib/elasticity/aws_request.rb +81 -20
  10. data/lib/elasticity/custom_jar_step.rb +33 -0
  11. data/lib/elasticity/emr.rb +45 -117
  12. data/lib/elasticity/hadoop_bootstrap_action.rb +27 -0
  13. data/lib/elasticity/hive_step.rb +57 -0
  14. data/lib/elasticity/job_flow.rb +109 -39
  15. data/lib/elasticity/job_flow_status.rb +53 -0
  16. data/lib/elasticity/job_flow_status_step.rb +35 -0
  17. data/lib/elasticity/job_flow_step.rb +17 -25
  18. data/lib/elasticity/pig_step.rb +82 -0
  19. data/lib/elasticity/support/conditional_raise.rb +23 -0
  20. data/lib/elasticity/version.rb +1 -1
  21. data/spec/lib/elasticity/aws_request_spec.rb +159 -51
  22. data/spec/lib/elasticity/custom_jar_step_spec.rb +59 -0
  23. data/spec/lib/elasticity/emr_spec.rb +231 -762
  24. data/spec/lib/elasticity/hadoop_bootstrap_action_spec.rb +26 -0
  25. data/spec/lib/elasticity/hive_step_spec.rb +74 -0
  26. data/spec/lib/elasticity/job_flow_integration_spec.rb +197 -0
  27. data/spec/lib/elasticity/job_flow_spec.rb +369 -138
  28. data/spec/lib/elasticity/job_flow_status_spec.rb +147 -0
  29. data/spec/lib/elasticity/job_flow_status_step_spec.rb +73 -0
  30. data/spec/lib/elasticity/job_flow_step_spec.rb +26 -64
  31. data/spec/lib/elasticity/pig_step_spec.rb +104 -0
  32. data/spec/lib/elasticity/support/conditional_raise_spec.rb +35 -0
  33. data/spec/spec_helper.rb +1 -50
  34. data/spec/support/be_a_hash_including_matcher.rb +35 -0
  35. metadata +101 -119
  36. data/.autotest +0 -2
  37. data/lib/elasticity/custom_jar_job.rb +0 -38
  38. data/lib/elasticity/hive_job.rb +0 -69
  39. data/lib/elasticity/pig_job.rb +0 -109
  40. data/lib/elasticity/simple_job.rb +0 -51
  41. data/spec/fixtures/vcr_cassettes/add_instance_groups/one_group_successful.yml +0 -44
  42. data/spec/fixtures/vcr_cassettes/add_instance_groups/one_group_unsuccessful.yml +0 -41
  43. data/spec/fixtures/vcr_cassettes/add_jobflow_steps/add_multiple_steps.yml +0 -266
  44. data/spec/fixtures/vcr_cassettes/custom_jar_job/cloudburst.yml +0 -41
  45. data/spec/fixtures/vcr_cassettes/describe_jobflows/all_jobflows.yml +0 -75
  46. data/spec/fixtures/vcr_cassettes/direct/terminate_jobflow.yml +0 -38
  47. data/spec/fixtures/vcr_cassettes/hive_job/hive_ads.yml +0 -41
  48. data/spec/fixtures/vcr_cassettes/modify_instance_groups/set_instances_to_3.yml +0 -38
  49. data/spec/fixtures/vcr_cassettes/pig_job/apache_log_reports.yml +0 -41
  50. data/spec/fixtures/vcr_cassettes/pig_job/apache_log_reports_with_bootstrap.yml +0 -41
  51. data/spec/fixtures/vcr_cassettes/run_jobflow/word_count.yml +0 -41
  52. data/spec/fixtures/vcr_cassettes/set_termination_protection/nonexistent_job_flows.yml +0 -41
  53. data/spec/fixtures/vcr_cassettes/set_termination_protection/protect_multiple_job_flows.yml +0 -38
  54. data/spec/fixtures/vcr_cassettes/terminate_jobflows/one_jobflow.yml +0 -38
  55. data/spec/lib/elasticity/custom_jar_job_spec.rb +0 -118
  56. data/spec/lib/elasticity/hive_job_spec.rb +0 -90
  57. data/spec/lib/elasticity/pig_job_spec.rb +0 -226
@@ -0,0 +1,27 @@
1
module Elasticity

  # Describes a bootstrap action that runs the configure-hadoop script from
  # the elasticmapreduce bucket, handing it one option and one value.
  class HadoopBootstrapAction

    attr_accessor :name, :option, :value

    # option and value are stored as given; they later become the two
    # script arguments, in that order.
    def initialize(option, value)
      @option = option
      @value = value
      @name = 'Elasticity Bootstrap Action (Configure Hadoop)'
    end

    # Returns this action as a hash: its :name plus a :script_bootstrap_action
    # entry carrying the script :path and its :args.
    def to_aws_bootstrap_action
      script = {
        :path => 's3n://elasticmapreduce/bootstrap-actions/configure-hadoop',
        :args => [@option, @value]
      }
      { :name => @name, :script_bootstrap_action => script }
    end

  end

end
@@ -0,0 +1,57 @@
1
module Elasticity

  # A job flow step that runs a Hive script through script-runner.jar.
  class HiveStep

    include JobFlowStep

    attr_accessor :name, :script, :variables, :action_on_failure

    # script is stored as given and is also embedded in the default step name.
    def initialize(script)
      @script = script
      @name = "Elasticity Hive Step (#{script})"
      @variables = {}
      @action_on_failure = 'TERMINATE_JOB_FLOW'
    end

    # Returns the step as a hash. The argument list is the hive-script
    # runner, '-f <script>', then one '-d key=value' pair per variable in
    # sorted key order. The job_flow argument is accepted but not consulted.
    def to_aws_step(job_flow)
      step_args = [
        's3://elasticmapreduce/libs/hive/hive-script',
        '--run-hive-script',
        '--args',
        '-f',
        @script
      ]
      @variables.keys.sort.each do |key|
        step_args << '-d' << "#{key}=#{@variables[key]}"
      end

      jar_step = {
        :jar => 's3://elasticmapreduce/libs/script-runner/script-runner.jar',
        :args => step_args
      }
      {
        :name => @name,
        :action_on_failure => @action_on_failure,
        :hadoop_jar_step => jar_step
      }
    end

    # Hive steps always report that they need an installation step.
    def self.requires_installation?
      true
    end

    # Returns the hash for the step that installs Hive.
    def self.aws_installation_step
      install_args = [
        's3://elasticmapreduce/libs/hive/hive-script',
        '--base-path',
        's3://elasticmapreduce/libs/hive/',
        '--install-hive'
      ]
      {
        :action_on_failure => 'TERMINATE_JOB_FLOW',
        :hadoop_jar_step => {
          :jar => 's3://elasticmapreduce/libs/script-runner/script-runner.jar',
          :args => install_args
        },
        :name => 'Elasticity - Install Hive'
      }
    end

  end

end
@@ -1,51 +1,121 @@
1
1
  module Elasticity
2
2
 
3
class JobFlowRunningError < StandardError; end
class JobFlowNotStartedError < StandardError; end
class JobFlowMissingStepsError < StandardError; end

# Collects the settings, bootstrap actions and steps of a job flow and
# submits them through an Elasticity::EMR client.
#
# A job flow counts as "running" once #run has stored a job flow id. Before
# that, steps are queued locally; afterwards they are sent to EMR at once.
#
# NOTE(review): raise_if / raise_unless are not defined in this class; they
# are presumably provided by the conditional_raise support file - confirm.
class JobFlow

  attr_accessor :action_on_failure
  attr_accessor :ec2_key_name
  attr_accessor :name
  attr_accessor :hadoop_version
  # The writer is defined explicitly below so that it can validate.
  attr_reader :instance_count
  attr_accessor :log_uri
  attr_accessor :master_instance_type
  attr_accessor :slave_instance_type
  attr_accessor :ami_version
  attr_accessor :keep_job_flow_alive_when_no_steps
  attr_accessor :ec2_subnet_id

  # access and secret are passed straight to Elasticity::EMR.new.
  def initialize(access, secret)
    @action_on_failure = 'TERMINATE_JOB_FLOW'
    @ec2_key_name = 'default'
    @hadoop_version = '0.20.205'
    @instance_count = 2
    @master_instance_type = 'm1.small'
    @name = 'Elasticity Job Flow'
    @slave_instance_type = 'm1.small'
    @ami_version = 'latest'
    @keep_job_flow_alive_when_no_steps = false

    @emr = Elasticity::EMR.new(access, secret)

    @bootstrap_actions = []
    @jobflow_steps = []
    @installed_steps = []
  end

  # Sets the instance count.
  #
  # Raises ArgumentError unless count is greater than 1. The message reports
  # the value that was actually requested.
  def instance_count=(count)
    unless count > 1
      raise ArgumentError, "Instance count cannot be set to less than 2 (requested #{count})"
    end
    @instance_count = count
  end

  # Queues a bootstrap action. Raises JobFlowRunningError once the job flow
  # has been run.
  def add_bootstrap_action(bootstrap_action)
    raise_if is_jobflow_running?, JobFlowRunningError, 'To modify bootstrap actions, please create a new job flow.'
    @bootstrap_actions << bootstrap_action
  end

  # Adds a step.
  #
  # Before #run the step is queued locally. On a running job flow it is
  # submitted immediately, preceded by its installation step if that step
  # class requires one and has not been installed yet.
  def add_step(jobflow_step)
    if is_jobflow_running?
      step_class = jobflow_step.class
      needs_install = step_class.send(:requires_installation?) && !@installed_steps.include?(step_class)

      aws_steps = []
      aws_steps << step_class.send(:aws_installation_step) if needs_install
      aws_steps << jobflow_step.to_aws_step(self)
      result = @emr.add_jobflow_steps(@jobflow_id, {:steps => aws_steps})

      # Remember the installation only after the submission went through, so
      # later steps of the same class do not submit it again.
      @installed_steps << step_class if needs_install
      result
    else
      @jobflow_steps << jobflow_step
    end
  end

  # Submits the job flow and stores the value returned by
  # @emr.run_job_flow as the job flow id.
  #
  # Raises JobFlowMissingStepsError when no steps were added and
  # JobFlowRunningError when the job flow was already run.
  def run
    raise_if @jobflow_steps.empty?, JobFlowMissingStepsError, 'Cannot run a job flow without adding steps. Please use #add_step.'
    raise_if is_jobflow_running?, JobFlowRunningError, 'Cannot run a job flow multiple times. To do more with this job flow, please use #add_step.'
    @jobflow_id ||= @emr.run_job_flow(jobflow_config)
  end

  # Terminates the job flow. Raises JobFlowNotStartedError before #run.
  def shutdown
    raise_unless is_jobflow_running?, JobFlowNotStartedError, 'Cannot #shutdown a job flow that has not yet been #run.'
    @emr.terminate_jobflows(@jobflow_id)
  end

  # Returns whatever @emr.describe_jobflow yields for this job flow.
  # Raises JobFlowNotStartedError before #run.
  def status
    raise_unless is_jobflow_running?, JobFlowNotStartedError, 'Please #run this job flow before attempting to retrieve status.'
    @emr.describe_jobflow(@jobflow_id)
  end

  private

  # True once a job flow id has been stored by #run.
  def is_jobflow_running?
    !!@jobflow_id
  end

  # Full configuration hash handed to @emr.run_job_flow: the preamble, the
  # steps, and the log URI and bootstrap actions when present.
  def jobflow_config
    config = jobflow_preamble
    config[:steps] = jobflow_steps
    config[:log_uri] = @log_uri if @log_uri
    config[:bootstrap_actions] = @bootstrap_actions.map{|a| a.to_aws_bootstrap_action} unless @bootstrap_actions.empty?
    config
  end

  # Name, AMI version and instance settings of the job flow.
  def jobflow_preamble
    instances = {
      :keep_job_flow_alive_when_no_steps => @keep_job_flow_alive_when_no_steps,
      :ec2_key_name => @ec2_key_name,
      :hadoop_version => @hadoop_version,
      :instance_count => @instance_count,
      :master_instance_type => @master_instance_type,
      :slave_instance_type => @slave_instance_type,
    }
    # The EMR API defines Ec2SubnetId as part of the instances
    # configuration, so it is nested there rather than at the top level.
    instances[:ec2_subnet_id] = @ec2_subnet_id if @ec2_subnet_id
    {
      :name => @name,
      :ami_version => @ami_version,
      :instances => instances
    }
  end

  # Converts the queued steps to hashes, inserting each step class's
  # installation step once, ahead of the first step that needs it.
  def jobflow_steps
    steps = []
    @jobflow_steps.each do |step|
      if step.class.send(:requires_installation?) && !@installed_steps.include?(step.class)
        steps << step.class.send(:aws_installation_step)
        @installed_steps << step.class
      end
      steps << step.to_aws_step(self)
    end
    steps
  end

end
@@ -0,0 +1,53 @@
1
module Elasticity

  # Snapshot of a job flow as read from a DescribeJobFlows <member> element.
  class JobFlowStatus

    attr_accessor :name, :jobflow_id, :state, :steps,
                  :created_at, :started_at, :ready_at,
                  :instance_count, :master_instance_type, :slave_instance_type,
                  :last_state_change_reason

    def initialize
      @steps = []
    end

    # Builds a JobFlowStatus from one AWS <member> (Nokogiri::XML::Element):
    #   /DescribeJobFlowsResponse/DescribeJobFlowsResult/JobFlows/member
    #
    # Text values are stripped. The start and ready times are nil when their
    # element text is empty; the creation time is always parsed.
    def self.from_member_element(xml_element)
      read = lambda { |path| xml_element.xpath(path).text.strip }
      optional_time = lambda do |path|
        raw = read.call(path)
        raw == '' ? nil : Time.parse(raw)
      end

      status = JobFlowStatus.new
      status.name = read.call('./Name')
      status.jobflow_id = read.call('./JobFlowId')
      status.state = read.call('./ExecutionStatusDetail/State')
      status.last_state_change_reason = read.call('./ExecutionStatusDetail/LastStateChangeReason')
      status.steps = JobFlowStatusStep.from_members_nodeset(xml_element.xpath('./Steps/member'))
      status.created_at = Time.parse(read.call('./ExecutionStatusDetail/CreationDateTime'))
      status.started_at = optional_time.call('./ExecutionStatusDetail/StartDateTime')
      status.ready_at = optional_time.call('./ExecutionStatusDetail/ReadyDateTime')
      status.instance_count = read.call('./Instances/InstanceCount')
      status.master_instance_type = read.call('./Instances/MasterInstanceType')
      status.slave_instance_type = read.call('./Instances/SlaveInstanceType')
      status
    end

    # Builds one JobFlowStatus per AWS <member> node (Nokogiri::XML::NodeSet):
    #   /DescribeJobFlowsResponse/DescribeJobFlowsResult/JobFlows
    def self.from_members_nodeset(members_nodeset)
      members_nodeset.map { |member| from_member_element(member) }
    end

  end

end
@@ -0,0 +1,35 @@
1
module Elasticity

  # Status of a single step as read from a DescribeJobFlows response.
  class JobFlowStatusStep

    attr_accessor :name, :state, :started_at, :ended_at

    # Builds a JobFlowStatusStep from one AWS <member> (Nokogiri::XML::Element):
    #   /DescribeJobFlowsResponse/DescribeJobFlowsResult/JobFlows/member/Steps/member
    #
    # Text values are stripped; a timestamp whose text is empty becomes nil.
    def self.from_member_element(xml_element)
      read = lambda { |path| xml_element.xpath(path).text.strip }
      optional_time = lambda do |path|
        raw = read.call(path)
        raw == "" ? nil : Time.parse(raw)
      end

      step = JobFlowStatusStep.new
      step.name = read.call("./StepConfig/Name")
      step.state = read.call("./ExecutionStatusDetail/State")
      step.started_at = optional_time.call("./ExecutionStatusDetail/StartDateTime")
      step.ended_at = optional_time.call("./ExecutionStatusDetail/EndDateTime")
      step
    end

    # Builds one JobFlowStatusStep per AWS <member> node (Nokogiri::XML::NodeSet):
    #   /DescribeJobFlowsResponse/DescribeJobFlowsResult/JobFlows/member/Steps/member
    def self.from_members_nodeset(members_nodeset)
      members_nodeset.map { |member| from_member_element(member) }
    end

  end

end
@@ -1,33 +1,25 @@
1
1
  module Elasticity
2
2
 
3
# Mixin for job flow steps. Including it gives the class default class-level
# answers about installation and an instance-level #to_aws_step that raises
# until the including class supplies its own.
module JobFlowStep

  # Class-level defaults, added to the including class by the included hook.
  module ClassMethods

    # By default a step needs no installation step.
    def requires_installation?
      false
    end

    # Always raises RuntimeError here; step classes that need installation
    # are expected to define their own version.
    def aws_installation_step
      raise '.aws_installation_step is required to be defined when a step requires installation (e.g. Pig, Hive).'
    end

  end

  # Hook run on include: extends the including class with ClassMethods.
  def self.included(base)
    base.extend(ClassMethods)
  end

  # Always raises RuntimeError here; every step class is expected to define
  # its own version.
  def to_aws_step(jobflow_step)
    raise '#to_aws_step is required to be defined on all job flow steps.'
  end

end
@@ -0,0 +1,82 @@
1
module Elasticity

  # A job flow step that runs a Pig script through script-runner.jar.
  class PigStep

    include JobFlowStep

    attr_accessor :name, :script, :variables, :action_on_failure

    # script is stored as given and is also embedded in the default step name.
    def initialize(script)
      @script = script
      @name = "Elasticity Pig Step (#{script})"
      @variables = {}
      @action_on_failure = 'TERMINATE_JOB_FLOW'
    end

    # Returns the step as a hash. The argument list is the pig-script
    # runner, one '-p key=value' pair per variable in sorted key order, an
    # E_PARALLELS parameter computed from the job flow's slave instance type
    # and instance count, and finally the script itself.
    def to_aws_step(job_flow)
      step_args = ['s3://elasticmapreduce/libs/pig/pig-script', '--run-pig-script', '--args']
      @variables.keys.sort.each do |key|
        step_args << '-p' << "#{key}=#{@variables[key]}"
      end
      e_parallels = parallels(job_flow.slave_instance_type, job_flow.instance_count)
      step_args << '-p' << "E_PARALLELS=#{e_parallels}"
      step_args << @script

      jar_step = {
        :jar => 's3://elasticmapreduce/libs/script-runner/script-runner.jar',
        :args => step_args,
      }
      {
        :action_on_failure => @action_on_failure,
        :hadoop_jar_step => jar_step,
        :name => @name
      }
    end

    # Pig steps always report that they need an installation step.
    def self.requires_installation?
      true
    end

    # Returns the hash for the step that installs Pig.
    def self.aws_installation_step
      install_args = [
        's3://elasticmapreduce/libs/pig/pig-script',
        '--base-path',
        's3://elasticmapreduce/libs/pig/',
        '--install-pig'
      ]
      {
        :action_on_failure => 'TERMINATE_JOB_FLOW',
        :hadoop_jar_step => {
          :jar => 's3://elasticmapreduce/libs/script-runner/script-runner.jar',
          :args => install_args,
        },
        :name => 'Elasticity - Install Pig'
      }
    end

    private

    # Default PARALLELS value, following the Pig Cookbook formula
    #
    #   <num machines> * <num reduce slots per machine> * 0.9
    #
    # rounded up, where the machine count used is instance_count - 1.
    # Reduce slots per instance type (from an AWS forum post):
    #
    #   m1.small   1
    #   m1.large   2
    #   m1.xlarge  4
    #   c1.medium  2
    #   c1.xlarge  4
    #
    # Any other instance type is treated as having 1 slot.
    def parallels(slave_instance_type, instance_count)
      slots_by_type = {
        'm1.small'  => 1,
        'm1.large'  => 2,
        'm1.xlarge' => 4,
        'c1.medium' => 2,
        'c1.xlarge' => 4
      }
      slots = slots_by_type.fetch(slave_instance_type, 1)
      ((instance_count - 1).to_f * slots.to_f * 0.9).ceil
    end

  end

end