wakoopa-elasticity 1.2.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. data/.autotest +2 -0
  2. data/.gitignore +5 -0
  3. data/.rspec +2 -0
  4. data/.rvmrc +1 -0
  5. data/Gemfile +4 -0
  6. data/HISTORY.mediawiki +30 -0
  7. data/LICENSE +202 -0
  8. data/README.mediawiki +332 -0
  9. data/Rakefile +11 -0
  10. data/elasticity.gemspec +29 -0
  11. data/lib/elasticity.rb +16 -0
  12. data/lib/elasticity/aws_request.rb +52 -0
  13. data/lib/elasticity/emr.rb +282 -0
  14. data/lib/elasticity/hive_job.rb +71 -0
  15. data/lib/elasticity/job_flow.rb +53 -0
  16. data/lib/elasticity/job_flow_step.rb +36 -0
  17. data/lib/elasticity/pig_job.rb +112 -0
  18. data/lib/elasticity/simple_job.rb +50 -0
  19. data/lib/elasticity/version.rb +3 -0
  20. data/spec/fixtures/vcr_cassettes/add_instance_groups/one_group_successful.yml +38 -0
  21. data/spec/fixtures/vcr_cassettes/add_instance_groups/one_group_unsuccessful.yml +35 -0
  22. data/spec/fixtures/vcr_cassettes/add_jobflow_steps/add_multiple_steps.yml +252 -0
  23. data/spec/fixtures/vcr_cassettes/describe_jobflows/all_jobflows.yml +69 -0
  24. data/spec/fixtures/vcr_cassettes/direct/terminate_jobflow.yml +32 -0
  25. data/spec/fixtures/vcr_cassettes/hive_job/hive_ads.yml +35 -0
  26. data/spec/fixtures/vcr_cassettes/modify_instance_groups/set_instances_to_3.yml +32 -0
  27. data/spec/fixtures/vcr_cassettes/pig_job/apache_log_reports.yml +35 -0
  28. data/spec/fixtures/vcr_cassettes/pig_job/apache_log_reports_with_bootstrap.yml +35 -0
  29. data/spec/fixtures/vcr_cassettes/run_jobflow/word_count.yml +35 -0
  30. data/spec/fixtures/vcr_cassettes/set_termination_protection/nonexistent_job_flows.yml +35 -0
  31. data/spec/fixtures/vcr_cassettes/set_termination_protection/protect_multiple_job_flows.yml +32 -0
  32. data/spec/fixtures/vcr_cassettes/terminate_jobflows/one_jobflow.yml +32 -0
  33. data/spec/lib/elasticity/aws_request_spec.rb +62 -0
  34. data/spec/lib/elasticity/emr_spec.rb +794 -0
  35. data/spec/lib/elasticity/hive_job_spec.rb +96 -0
  36. data/spec/lib/elasticity/job_flow_spec.rb +139 -0
  37. data/spec/lib/elasticity/job_flow_step_spec.rb +76 -0
  38. data/spec/lib/elasticity/pig_job_spec.rb +211 -0
  39. data/spec/spec_helper.rb +43 -0
  40. metadata +253 -0
@@ -0,0 +1,11 @@
1
# Rake entry point: loads the Bundler gem tasks and the RSpec task, and makes
# running the specs the default action.
require 'bundler'
require 'rake/testtask'
require 'rspec/core/rake_task'

# Registers the gem tasks provided by Bundler's GemHelper.
Bundler::GemHelper.install_tasks

# A bare `rake` invocation runs the :spec task.
desc 'Run specs'
task :default => :spec

# Defines the RSpec rake task that :default depends on.
desc "Run specs"
RSpec::Core::RakeTask.new
@@ -0,0 +1,29 @@
1
# -*- encoding: utf-8 -*-
# Gem specification for wakoopa-elasticity. The version comes from
# lib/elasticity/version.rb, so lib/ is put on the load path first.
lib_dir = File.expand_path("../lib", __FILE__)
$LOAD_PATH.push(lib_dir)
require "elasticity/version"

Gem::Specification.new do |s|
  # Summary and description share the same text.
  blurb = %q{Programmatic access to Amazon's Elastic Map Reduce service.}

  s.name        = "wakoopa-elasticity"
  s.version     = Elasticity::VERSION
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Robert Slifka"]
  s.homepage    = "http://www.github.com/rslifka/elasticity"
  s.summary     = blurb
  s.description = blurb

  # Runtime dependencies.
  ["rest-client", "nokogiri"].each do |gem_name|
    s.add_dependency(gem_name)
  end

  # Development-only dependencies: name followed by optional version
  # requirements, registered in this order.
  [
    ["autotest-fsevent"],
    ["autotest-growl"],
    ["rake"],
    ["rspec", ">= 2.5.0"],
    ["vcr", ">= 1.5.1"],
    ["webmock", ">= 1.6.2"],
    ["ZenTest"]
  ].each do |name_and_requirements|
    s.add_development_dependency(*name_and_requirements)
  end

  # File lists are taken from what git tracks.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
  s.require_paths = ["lib"]
end
@@ -0,0 +1,16 @@
1
+ require 'base64'
2
+
3
+ require 'rest_client'
4
+ require 'nokogiri'
5
+
6
+ require 'elasticity/aws_request'
7
+ require 'elasticity/emr'
8
+ require 'elasticity/job_flow'
9
+ require 'elasticity/job_flow_step'
10
+
11
+ require 'elasticity/simple_job'
12
+ require 'elasticity/hive_job'
13
+ require 'elasticity/pig_job'
14
+
15
+ module Elasticity
16
+ end
@@ -0,0 +1,52 @@
1
require 'openssl'

module Elasticity

  # Builds query-string requests for the Elastic MapReduce endpoint, signs them
  # (SignatureVersion 2, HmacSHA256) and issues them through RestClient.
  class AwsRequest

    # aws_access_key_id::     sent with every request as AWSAccessKeyId
    # aws_secret_access_key:: HMAC key used to sign the request
    # options::               :secure (default true) selects https over http;
    #                         :region, when present, selects the regional
    #                         endpoint elasticmapreduce.<region>.amazonaws.com
    def initialize(aws_access_key_id, aws_secret_access_key, options = {})
      @access_key = aws_access_key_id
      @secret_key = aws_secret_access_key
      @options = {:secure => true}.merge(options)
    end

    # Signs +params+ (a hash of AWS parameter names => values) and performs a
    # GET against the EMR endpoint. Returns whatever RestClient.get returns.
    def aws_emr_request(params)
      host = @options[:region] ? "elasticmapreduce.#{@options[:region]}.amazonaws.com" : "elasticmapreduce.amazonaws.com"
      protocol = @options[:secure] ? "https" : "http"

      signed_params = sign_params(params, "GET", host, "/")
      signed_request = "#{protocol}://#{host}?#{signed_params}"
      RestClient.get signed_request
    end

    # (Used from RightScale's right_aws gem.)
    # EC2, SQS, SDB and EMR requests must be signed by this guy.
    # See: http://docs.amazonwebservices.com/AmazonSimpleDB/2007-11-07/DeveloperGuide/index.html?REST_RESTAuth.html
    # http://developer.amazonwebservices.com/connect/entry.jspa?externalID=1928
    #
    # Returns the escaped, key-sorted query string with "&Signature=..."
    # appended. +service_hash+ itself is left untouched.
    def sign_params(service_hash, http_verb, host, uri)
      # Merge into a copy: callers (e.g. EMR#direct) hand us their own hash and
      # should not get credentials and signing metadata written back into it.
      signing_params = service_hash.merge(
        "AWSAccessKeyId"   => @access_key,
        "Timestamp"        => Time.now.utc.strftime("%Y-%m-%dT%H:%M:%S.000Z"),
        "SignatureVersion" => "2",
        "SignatureMethod"  => "HmacSHA256"
      )
      canonical_string = signing_params.keys.sort.map do |key|
        "#{AwsRequest.aws_escape(key)}=#{AwsRequest.aws_escape(signing_params[key])}"
      end.join('&')
      string_to_sign = "#{http_verb.to_s.upcase}\n#{host.downcase}\n#{uri}\n#{canonical_string}"
      digest = OpenSSL::HMAC.digest("sha256", @secret_key, string_to_sign)
      # Array#pack("m") produces Base64 text with a trailing newline; strip
      # removes it before the value is URL-escaped.
      signature = AwsRequest.aws_escape([digest].pack("m").strip)
      "#{canonical_string}&Signature=#{signature}"
    end

    class << self

      # (Used from RightScale's right_aws gem)
      # Escape a string according to Amazon's rules.
      # See: http://docs.amazonwebservices.com/AmazonSimpleDB/2007-11-07/DeveloperGuide/index.html?REST_RESTAuth.html
      #
      # Everything except letters, digits and . _ ~ - is replaced by an
      # upper-case %XX sequence for each of its *bytes*, so multibyte characters
      # become several escapes (e.g. U+00E9 => "%C3%A9").
      def aws_escape(param)
        param.to_s.gsub(/[^a-zA-Z0-9._~-]+/) do |unsafe|
          # unpack('C*') yields raw bytes regardless of the string's encoding.
          unsafe.unpack('C*').map { |byte| format('%%%02X', byte) }.join
        end
      end

    end

  end

end
@@ -0,0 +1,282 @@
1
module Elasticity

  # Thin Ruby wrapper around the Amazon Elastic MapReduce query API. Ruby-style
  # parameter hashes are converted to AWS parameter names and sent through
  # Elasticity::AwsRequest.
  class EMR

    # Credentials and options are handed straight to Elasticity::AwsRequest.
    def initialize(aws_access_key_id, aws_secret_access_key, options = {})
      @aws_request = Elasticity::AwsRequest.new(aws_access_key_id, aws_secret_access_key, options)
    end

    # Lists all jobflows in all states.
    #
    # Yields the raw AWS response when a block is given. Returns the result of
    # JobFlow.from_members_nodeset for the <member> nodes of the response.
    def describe_jobflows(params = {})
      request = EMR.convert_ruby_to_aws(params.merge(:operation => "DescribeJobFlows"))
      aws_result = @aws_request.aws_emr_request(request)
      xml_doc = Nokogiri::XML(aws_result)
      xml_doc.remove_namespaces!
      yield aws_result if block_given?
      JobFlow.from_members_nodeset(xml_doc.xpath("/DescribeJobFlowsResponse/DescribeJobFlowsResult/JobFlows/member"))
    end

    # Adds a new group of instances to the specified jobflow. Elasticity maps a
    # more Ruby-like syntax to the Amazon options. An exhaustive hash follows although
    # not all of these options are required (or valid!) at once. Please see the
    # EMR docs for details although even then you're going to need to experiment :)
    #
    #   instance_group_config = {
    #     :bid_price => 5,
    #     :instance_count => 1,
    #     :instance_role => "TASK",
    #     :market => "SPOT",
    #     :name => "Go Canucks Go!",
    #     :type => "m1.small",
    #   }
    #
    # add_instance_groups takes an array of {}. Returns an array of the instance IDs
    # that were created by the specified configs.
    #
    #   ["ig-2GOVEN6HVJZID", "ig-1DU9M2UQMM051", "ig-3DZRW4Y2X4S", ...]
    #
    # Yields the raw AWS response when a block is given. Raises ArgumentError
    # (with the AWS error message) when AWS answers with a bad request.
    def add_instance_groups(jobflow_id, instance_group_configs)
      params = {
        :operation => "AddInstanceGroups",
        :job_flow_id => jobflow_id,
        :instance_groups => instance_group_configs
      }
      aws_result = @aws_request.aws_emr_request(EMR.convert_ruby_to_aws(params))
      xml_doc = Nokogiri::XML(aws_result)
      xml_doc.remove_namespaces!
      members = xml_doc.xpath("/AddInstanceGroupsResponse/AddInstanceGroupsResult/InstanceGroupIds/member")
      instance_group_ids = members.map { |member| member.text }
      yield aws_result if block_given?
      instance_group_ids
    rescue RestClient::BadRequest => e
      raise ArgumentError, EMR.parse_error_response(e.http_body)
    end

    # Add a step (or steps) to the specified job flow.
    #
    #   emr.add_jobflow_steps("j-123", {
    #     :steps => [
    #       {
    #         :action_on_failure => "TERMINATE_JOB_FLOW",
    #         :hadoop_jar_step => {
    #           :args => [
    #             "s3://elasticmapreduce/libs/pig/pig-script",
    #             "--base-path",
    #             "s3://elasticmapreduce/libs/pig/",
    #             "--install-pig"
    #           ],
    #           :jar => "s3://elasticmapreduce/libs/script-runner/script-runner.jar"
    #         },
    #         :name => "Setup Pig"
    #       }
    #     ]
    #   })
    #
    # Yields the raw AWS response when a block is given. Raises ArgumentError
    # (with the AWS error message) when AWS answers with a bad request.
    def add_jobflow_steps(jobflow_id, steps_config)
      params = {
        :operation => "AddJobFlowSteps",
        :job_flow_id => jobflow_id
      }.merge(steps_config)
      aws_result = @aws_request.aws_emr_request(EMR.convert_ruby_to_aws(params))
      yield aws_result if block_given?
    rescue RestClient::BadRequest => e
      raise ArgumentError, EMR.parse_error_response(e.http_body)
    end

    # Set the number of instances in the specified instance groups to the
    # specified counts. Note that this modifies the *request* count, which
    # is not the same as the *running* count. I.e. you request instances
    # and then wait for them to be created.
    #
    # Takes a {} of instance group IDs => desired instance count.
    #
    #   {"ig-1" => 40, "ig-2" => 5, ...}
    #
    # Yields the raw AWS response when a block is given. Raises ArgumentError
    # (with the AWS error message) when AWS answers with a bad request.
    def modify_instance_groups(instance_group_config)
      instance_groups = instance_group_config.map do |group_id, count|
        {:instance_group_id => group_id, :instance_count => count}
      end
      params = {
        :operation => "ModifyInstanceGroups",
        :instance_groups => instance_groups
      }
      aws_result = @aws_request.aws_emr_request(EMR.convert_ruby_to_aws(params))
      yield aws_result if block_given?
    rescue RestClient::BadRequest => e
      raise ArgumentError, EMR.parse_error_response(e.http_body)
    end

    # Start a job flow with the specified configuration. This is a very thin
    # wrapper around the AWS API, so in order to use it directly you'll need
    # to have the PDF API reference handy, which can be found here:
    #
    # http://awsdocs.s3.amazonaws.com/ElasticMapReduce/20090331/emr-api-20090331.pdf
    #
    # Here is a sample job flow configuration that should help. This job flow
    # starts by installing Pig then running a Pig script. It is based off of the
    # Pig demo script from Amazon.
    #
    #   emr.run_job_flow({
    #     :name => "Elasticity Test Flow (EMR Pig Script)",
    #     :instances => {
    #       :ec2_key_name => "sharethrough-dev",
    #       :hadoop_version => "0.20",
    #       :instance_count => 2,
    #       :master_instance_type => "m1.small",
    #       :placement => {
    #         :availability_zone => "us-east-1a"
    #       },
    #       :slave_instance_type => "m1.small",
    #     },
    #     :steps => [
    #       {
    #         :action_on_failure => "TERMINATE_JOB_FLOW",
    #         :hadoop_jar_step => {
    #           :args => [
    #             "s3://elasticmapreduce/libs/pig/pig-script",
    #             "--base-path",
    #             "s3://elasticmapreduce/libs/pig/",
    #             "--install-pig"
    #           ],
    #           :jar => "s3://elasticmapreduce/libs/script-runner/script-runner.jar"
    #         },
    #         :name => "Setup Pig"
    #       },
    #       {
    #         :action_on_failure => "TERMINATE_JOB_FLOW",
    #         :hadoop_jar_step => {
    #           :args => [
    #             "s3://elasticmapreduce/libs/pig/pig-script",
    #             "--run-pig-script",
    #             "--args",
    #             "-p",
    #             "INPUT=s3n://elasticmapreduce/samples/pig-apache/input",
    #             "-p",
    #             "OUTPUT=s3n://slif-elasticity/pig-apache/output/2011-04-19",
    #             "s3n://elasticmapreduce/samples/pig-apache/do-reports.pig"
    #           ],
    #           :jar => "s3://elasticmapreduce/libs/script-runner/script-runner.jar"
    #         },
    #         :name => "Run Pig Script"
    #       }
    #     ]
    #   })
    #
    # Yields the raw AWS response when a block is given. Returns the text of
    # the JobFlowId element of the response. Raises ArgumentError (with the AWS
    # error message) when AWS answers with a bad request.
    def run_job_flow(job_flow_config)
      params = {
        :operation => "RunJobFlow",
      }.merge(job_flow_config)
      aws_result = @aws_request.aws_emr_request(EMR.convert_ruby_to_aws(params))
      yield aws_result if block_given?
      xml_doc = Nokogiri::XML(aws_result)
      xml_doc.remove_namespaces!
      xml_doc.xpath("/RunJobFlowResponse/RunJobFlowResult/JobFlowId").text
    rescue RestClient::BadRequest => e
      raise ArgumentError, EMR.parse_error_response(e.http_body)
    end

    # Enable or disable "termination protection" on the specified job flows.
    # Termination protection prevents a job flow from being terminated by a
    # user initiated action, although the job flow will still terminate
    # naturally.
    #
    # Takes an [] of job flow IDs.
    #
    #   ["j-1B4D1XP0C0A35", "j-1YG2MYL0HVYS5", ...]
    #
    # Yields the raw AWS response when a block is given. Raises ArgumentError
    # (with the AWS error message) when AWS answers with a bad request.
    def set_termination_protection(jobflow_ids, protection_enabled=true)
      params = {
        :operation => "SetTerminationProtection",
        :termination_protected => protection_enabled,
        :job_flow_ids => jobflow_ids
      }
      aws_result = @aws_request.aws_emr_request(EMR.convert_ruby_to_aws(params))
      yield aws_result if block_given?
    rescue RestClient::BadRequest => e
      raise ArgumentError, EMR.parse_error_response(e.http_body)
    end

    # Terminate the specified jobflow. Amazon does not define a return value
    # for this operation, so you'll need to poll #describe_jobflows to see
    # the state of the jobflow. Raises ArgumentError if the specified job
    # flow does not exist.
    def terminate_jobflows(jobflow_id)
      params = {
        :operation => "TerminateJobFlows",
        :job_flow_ids => [jobflow_id]
      }
      aws_result = @aws_request.aws_emr_request(EMR.convert_ruby_to_aws(params))
      yield aws_result if block_given?
    rescue RestClient::BadRequest
      raise ArgumentError, "Job flow '#{jobflow_id}' does not exist."
    end

    # Pass the specified params hash directly through to the AWS request URL.
    # Use this if you want to perform an operation that hasn't yet been wrapped
    # by Elasticity or you just want to see the response XML for yourself :)
    def direct(params)
      @aws_request.aws_emr_request(params)
    end

    # The helpers below are class-level and are invoked with an explicit
    # receiver (EMR.convert_ruby_to_aws, EMR.parse_error_response), so they
    # have to stay public.
    class << self

      # AWS error responses all follow the same form. Extract the message from
      # the error document.
      def parse_error_response(error_xml)
        xml_doc = Nokogiri::XML(error_xml)
        xml_doc.remove_namespaces!
        xml_doc.xpath("/ErrorResponse/Error/Message").text
      end

      # Since we use the same structure as AWS, we can generate AWS param names
      # from the Ruby versions of those names (and the param nesting).
      #
      # Hash values nest as "Parent.Child"; Array values are numbered from 1
      # as "Name.member.N". Hash items inside an array are expanded
      # recursively; any other item (String, Integer, Symbol, ...) is stored
      # as-is under its member key.
      def convert_ruby_to_aws(params)
        result = {}
        params.each do |key, value|
          aws_key = camelize(key.to_s)
          case value
          when Array
            value.each_with_index do |item, index|
              member_key = "#{aws_key}.member.#{index + 1}"
              if item.is_a?(Hash)
                convert_ruby_to_aws(item).each do |nested_key, nested_value|
                  result["#{member_key}.#{nested_key}"] = nested_value
                end
              else
                result[member_key] = item
              end
            end
          when Hash
            convert_ruby_to_aws(value).each do |nested_key, nested_value|
              result["#{aws_key}.#{nested_key}"] = nested_value
            end
          else
            result[aws_key] = value
          end
        end
        result
      end

      # (Used from Rails' ActiveSupport)
      #
      # Turns "job_flow_id" into "JobFlowId". With first_letter_in_uppercase
      # set to false the first character of the input is kept unchanged
      # ("job_flow_id" => "jobFlowId").
      def camelize(lower_case_and_underscored_word, first_letter_in_uppercase = true)
        word = lower_case_and_underscored_word.to_s
        if first_letter_in_uppercase
          word.gsub(/\/(.?)/) { "::" + Regexp.last_match(1).upcase }.
            gsub(/(^|_)(.)/) { Regexp.last_match(2).upcase }
        else
          # String#first only exists with ActiveSupport loaded; slice instead.
          # to_s covers the empty string, where [1..-1] yields nil.
          word[0, 1] + camelize(word)[1..-1].to_s
        end
      end

    end

  end

end
@@ -0,0 +1,71 @@
1
module Elasticity

  # HiveJob allows you to quickly and easily kick off a Hive jobflow without
  # having to understand the entirety of the EMR API.
  class HiveJob < Elasticity::SimpleJob

    # Forwards both credentials to SimpleJob's initializer, then sets the
    # job flow name used by #run.
    def initialize(aws_access_key_id, aws_secret_access_key)
      super
      @name = "Elasticity Hive Job"
    end

    # Run the specified Hive script with the specified variables.
    #
    #   hive = Elasticity::HiveJob.new("access", "secret")
    #   jobflow_id = hive.run('s3n://slif-hive/test.q', {
    #     'SCRIPTS' => 's3n://slif-test/scripts',
    #     'OUTPUT'  => 's3n://slif-test/output',
    #     'XREFS'   => 's3n://slif-test/xrefs'
    #   })
    #
    # The variables are accessible within your Hive scripts by using the
    # standard ${NAME} syntax. E.g.
    #
    #   ADD JAR ${SCRIPTS}/jsonserde.jar;
    #
    # The job flow has two steps: "Setup Hive" followed by "Run Hive Script".
    # Returns whatever @emr.run_job_flow returns.
    def run(hive_script, hive_variables={})
      # Each variable is passed to the script as a "-d NAME=value" pair.
      run_script_args = ["s3://elasticmapreduce/libs/hive/hive-script", "--run-hive-script", "--args", "-f", hive_script]
      hive_variables.each do |variable_name, value|
        run_script_args.push("-d", "#{variable_name}=#{value}")
      end

      instances = {
        :ec2_key_name => @ec2_key_name,
        :hadoop_version => @hadoop_version,
        :instance_count => @instance_count,
        :master_instance_type => @master_instance_type,
        :slave_instance_type => @slave_instance_type
      }

      # The setup step always terminates the job flow on failure.
      install_step = {
        :action_on_failure => "TERMINATE_JOB_FLOW",
        :hadoop_jar_step => {
          :jar => "s3://elasticmapreduce/libs/script-runner/script-runner.jar",
          :args => [
            "s3://elasticmapreduce/libs/hive/hive-script",
            "--base-path", "s3://elasticmapreduce/libs/hive/",
            "--install-hive"
          ]
        },
        :name => "Setup Hive"
      }

      # The script step uses the configured @action_on_failure.
      script_step = {
        :action_on_failure => @action_on_failure,
        :hadoop_jar_step => {
          :jar => "s3://elasticmapreduce/libs/script-runner/script-runner.jar",
          :args => run_script_args
        },
        :name => "Run Hive Script"
      }

      jobflow_config = {
        :name => @name,
        :instances => instances,
        :steps => [install_step, script_step]
      }
      # Only send a log URI when one has been configured.
      jobflow_config[:log_uri] = @log_uri if @log_uri

      @emr.run_job_flow(jobflow_config)
    end

  end

end