wakoopa-elasticity 1.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. data/.autotest +2 -0
  2. data/.gitignore +5 -0
  3. data/.rspec +2 -0
  4. data/.rvmrc +1 -0
  5. data/Gemfile +4 -0
  6. data/HISTORY.mediawiki +30 -0
  7. data/LICENSE +202 -0
  8. data/README.mediawiki +332 -0
  9. data/Rakefile +11 -0
  10. data/elasticity.gemspec +29 -0
  11. data/lib/elasticity.rb +16 -0
  12. data/lib/elasticity/aws_request.rb +52 -0
  13. data/lib/elasticity/emr.rb +282 -0
  14. data/lib/elasticity/hive_job.rb +71 -0
  15. data/lib/elasticity/job_flow.rb +53 -0
  16. data/lib/elasticity/job_flow_step.rb +36 -0
  17. data/lib/elasticity/pig_job.rb +112 -0
  18. data/lib/elasticity/simple_job.rb +50 -0
  19. data/lib/elasticity/version.rb +3 -0
  20. data/spec/fixtures/vcr_cassettes/add_instance_groups/one_group_successful.yml +38 -0
  21. data/spec/fixtures/vcr_cassettes/add_instance_groups/one_group_unsuccessful.yml +35 -0
  22. data/spec/fixtures/vcr_cassettes/add_jobflow_steps/add_multiple_steps.yml +252 -0
  23. data/spec/fixtures/vcr_cassettes/describe_jobflows/all_jobflows.yml +69 -0
  24. data/spec/fixtures/vcr_cassettes/direct/terminate_jobflow.yml +32 -0
  25. data/spec/fixtures/vcr_cassettes/hive_job/hive_ads.yml +35 -0
  26. data/spec/fixtures/vcr_cassettes/modify_instance_groups/set_instances_to_3.yml +32 -0
  27. data/spec/fixtures/vcr_cassettes/pig_job/apache_log_reports.yml +35 -0
  28. data/spec/fixtures/vcr_cassettes/pig_job/apache_log_reports_with_bootstrap.yml +35 -0
  29. data/spec/fixtures/vcr_cassettes/run_jobflow/word_count.yml +35 -0
  30. data/spec/fixtures/vcr_cassettes/set_termination_protection/nonexistent_job_flows.yml +35 -0
  31. data/spec/fixtures/vcr_cassettes/set_termination_protection/protect_multiple_job_flows.yml +32 -0
  32. data/spec/fixtures/vcr_cassettes/terminate_jobflows/one_jobflow.yml +32 -0
  33. data/spec/lib/elasticity/aws_request_spec.rb +62 -0
  34. data/spec/lib/elasticity/emr_spec.rb +794 -0
  35. data/spec/lib/elasticity/hive_job_spec.rb +96 -0
  36. data/spec/lib/elasticity/job_flow_spec.rb +139 -0
  37. data/spec/lib/elasticity/job_flow_step_spec.rb +76 -0
  38. data/spec/lib/elasticity/pig_job_spec.rb +211 -0
  39. data/spec/spec_helper.rb +43 -0
  40. metadata +253 -0
@@ -0,0 +1,11 @@
1
+ require 'bundler'
2
+ Bundler::GemHelper.install_tasks
3
+
4
+ require 'rake/testtask'
5
+ require 'rspec/core/rake_task'
6
+
7
+ desc 'Run specs'
8
+ task :default => :spec
9
+
10
+ desc "Run specs"
11
+ RSpec::Core::RakeTask.new
@@ -0,0 +1,29 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "elasticity/version"
4
+
5
+ Gem::Specification.new do |s|
6
+ s.name = "wakoopa-elasticity"
7
+ s.version = Elasticity::VERSION
8
+ s.platform = Gem::Platform::RUBY
9
+ s.authors = ["Robert Slifka"]
10
+ s.homepage = "http://www.github.com/rslifka/elasticity"
11
+ s.summary = %q{Programmatic access to Amazon's Elastic Map Reduce service.}
12
+ s.description = %q{Programmatic access to Amazon's Elastic Map Reduce service.}
13
+
14
+ s.add_dependency("rest-client")
15
+ s.add_dependency("nokogiri")
16
+
17
+ s.add_development_dependency("autotest-fsevent")
18
+ s.add_development_dependency("autotest-growl")
19
+ s.add_development_dependency("rake")
20
+ s.add_development_dependency("rspec", ">= 2.5.0")
21
+ s.add_development_dependency("vcr", ">= 1.5.1")
22
+ s.add_development_dependency("webmock", ">= 1.6.2")
23
+ s.add_development_dependency("ZenTest")
24
+
25
+ s.files = `git ls-files`.split("\n")
26
+ s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
27
+ s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
28
+ s.require_paths = ["lib"]
29
+ end
@@ -0,0 +1,16 @@
1
+ require 'base64'
2
+
3
+ require 'rest_client'
4
+ require 'nokogiri'
5
+
6
+ require 'elasticity/aws_request'
7
+ require 'elasticity/emr'
8
+ require 'elasticity/job_flow'
9
+ require 'elasticity/job_flow_step'
10
+
11
+ require 'elasticity/simple_job'
12
+ require 'elasticity/hive_job'
13
+ require 'elasticity/pig_job'
14
+
15
+ module Elasticity
16
+ end
@@ -0,0 +1,52 @@
1
module Elasticity

  # Builds, signs and sends query-style GET requests to the Amazon Elastic
  # MapReduce endpoint. Requests are signed with AWS Signature Version 2
  # (HmacSHA256).
  class AwsRequest

    # Runs of characters that must be percent-encoded, i.e. everything except
    # letters, digits, '.', '_', '~' and '-'.
    ESCAPED_CHARACTERS = /[^a-zA-Z0-9._~-]+/

    # aws_access_key_id     - access key sent as the AWSAccessKeyId parameter.
    # aws_secret_access_key - secret key used as the HMAC key when signing.
    # options               - :secure (default true) selects https over http;
    #                         :region, when present, selects the regional
    #                         endpoint host.
    def initialize(aws_access_key_id, aws_secret_access_key, options = {})
      @access_key = aws_access_key_id
      @secret_key = aws_secret_access_key
      @options = {:secure => true}.merge(options)
    end

    # Signs +params+ and issues the GET request, returning whatever
    # RestClient.get returns.  +params+ is left unmodified.
    def aws_emr_request(params)
      host = @options[:region] ? "elasticmapreduce.#{@options[:region]}.amazonaws.com" : "elasticmapreduce.amazonaws.com"
      protocol = @options[:secure] ? "https" : "http"

      signed_params = sign_params(params, "GET", host, "/")
      signed_request = "#{protocol}://#{host}?#{signed_params}"
      RestClient.get signed_request
    end

    # (Used from RightScale's right_aws gem.)
    # EC2, SQS, SDB and EMR requests must be signed by this guy.
    # See: http://docs.amazonwebservices.com/AmazonSimpleDB/2007-11-07/DeveloperGuide/index.html?REST_RESTAuth.html
    #      http://developer.amazonwebservices.com/connect/entry.jspa?externalID=1928
    #
    # Returns the escaped, key-sorted query string followed by
    # "&Signature=...".  The signing parameters are added to a copy of
    # +service_hash+, so the caller's hash is never mutated.
    def sign_params(service_hash, http_verb, host, uri)
      signed_hash = service_hash.merge(
        "AWSAccessKeyId"   => @access_key,
        "Timestamp"        => Time.now.utc.strftime("%Y-%m-%dT%H:%M:%S.000Z"),
        "SignatureVersion" => "2",
        "SignatureMethod"  => "HmacSHA256"
      )
      canonical_string = signed_hash.keys.sort.map do |key|
        "#{AwsRequest.aws_escape(key)}=#{AwsRequest.aws_escape(signed_hash[key])}"
      end.join('&')
      string_to_sign = "#{http_verb.to_s.upcase}\n#{host.downcase}\n#{uri}\n#{canonical_string}"
      digest = OpenSSL::HMAC.digest("sha256", @secret_key, string_to_sign)
      # pack("m") is Base64 encoding; the line breaks it inserts must not end
      # up inside the signature, so they are removed.
      signature = AwsRequest.aws_escape([digest].pack("m").delete("\n"))
      "#{canonical_string}&Signature=#{signature}"
    end

    class << self

      # (Used from RightScale's right_aws gem)
      # Escape a string according to Amazon's rules.
      # See: http://docs.amazonwebservices.com/AmazonSimpleDB/2007-11-07/DeveloperGuide/index.html?REST_RESTAuth.html
      #
      # Every byte of a character outside the unreserved set is emitted as
      # %XX (uppercase hex), so multibyte characters are escaped in full.
      def aws_escape(param)
        param.to_s.gsub(ESCAPED_CHARACTERS) do |run|
          run.each_byte.map { |byte| "%%%02X" % byte }.join
        end
      end

    end

  end

end
@@ -0,0 +1,282 @@
1
module Elasticity

  # Ruby-flavoured wrapper around the Amazon Elastic MapReduce query API.
  # Each public method builds a params hash using Ruby-style snake_case
  # symbols, converts it to AWS parameter names and sends it through an
  # Elasticity::AwsRequest.
  class EMR

    def initialize(aws_access_key_id, aws_secret_access_key, options = {})
      @aws_request = Elasticity::AwsRequest.new(aws_access_key_id, aws_secret_access_key, options)
    end

    # Lists all jobflows in all states.
    #
    # Yields the raw response to the block, if one is given, and returns the
    # result of JobFlow.from_members_nodeset for the jobflow members.
    def describe_jobflows(params = {})
      aws_result = @aws_request.aws_emr_request(EMR.convert_ruby_to_aws(
        params.merge({:operation => "DescribeJobFlows"}))
      )
      xml_doc = Nokogiri::XML(aws_result)
      xml_doc.remove_namespaces!
      yield aws_result if block_given?
      JobFlow.from_members_nodeset(xml_doc.xpath("/DescribeJobFlowsResponse/DescribeJobFlowsResult/JobFlows/member"))
    end

    # Adds a new group of instances to the specified jobflow.  Elasticity maps a
    # more Ruby-like syntax to the Amazon options.  An exhaustive hash follows although
    # not all of these options are required (or valid!) at once.  Please see the
    # EMR docs for details although even then you're going to need to experiment :)
    #
    #   instance_group_config = {
    #     :bid_price => 5,
    #     :instance_count => 1,
    #     :instance_role => "TASK",
    #     :market => "SPOT",
    #     :name => "Go Canucks Go!",
    #     :type => "m1.small"
    #   }
    #
    # add_instance_groups takes an array of {}.  Returns an array of the instance IDs
    # that were created by the specified configs.
    #
    #   ["ig-2GOVEN6HVJZID", "ig-1DU9M2UQMM051", "ig-3DZRW4Y2X4S", ...]
    #
    # Raises ArgumentError (with the AWS error message) on a bad request.
    def add_instance_groups(jobflow_id, instance_group_configs)
      params = {
        :operation => "AddInstanceGroups",
        :job_flow_id => jobflow_id,
        :instance_groups => instance_group_configs
      }
      request_emr(params) do |aws_result|
        xml_doc = Nokogiri::XML(aws_result)
        xml_doc.remove_namespaces!
        instance_group_ids = []
        xml_doc.xpath("/AddInstanceGroupsResponse/AddInstanceGroupsResult/InstanceGroupIds/member").each do |member|
          instance_group_ids << member.text
        end
        yield aws_result if block_given?
        instance_group_ids
      end
    end

    # Add a step (or steps) to the specified job flow.
    #
    #   emr.add_jobflow_steps("j-123", {
    #     :steps => [
    #       {
    #         :action_on_failure => "TERMINATE_JOB_FLOW",
    #         :hadoop_jar_step => {
    #           :args => [
    #             "s3://elasticmapreduce/libs/pig/pig-script",
    #               "--base-path",
    #               "s3://elasticmapreduce/libs/pig/",
    #               "--install-pig"
    #           ],
    #           :jar => "s3://elasticmapreduce/libs/script-runner/script-runner.jar"
    #         },
    #         :name => "Setup Pig"
    #       }
    #     ]
    #   })
    #
    # Yields the raw response if a block is given.  Raises ArgumentError
    # (with the AWS error message) on a bad request.
    def add_jobflow_steps(jobflow_id, steps_config)
      params = {
        :operation => "AddJobFlowSteps",
        :job_flow_id => jobflow_id
      }.merge(steps_config)
      request_emr(params) { |aws_result| yield aws_result if block_given? }
    end

    # Set the number of instances in the specified instance groups to the
    # specified counts.  Note that this modifies the *request* count, which
    # is not the same as the *running* count.  I.e. you request instances
    # and then wait for them to be created.
    #
    # Takes a {} of instance group IDs => desired instance count.
    #
    #   {"ig-1" => 40, "ig-2" => 5, ...}
    #
    # Yields the raw response if a block is given.  Raises ArgumentError
    # (with the AWS error message) on a bad request.
    def modify_instance_groups(instance_group_config)
      params = {
        :operation => "ModifyInstanceGroups",
        :instance_groups => instance_group_config.map { |k, v| {:instance_group_id => k, :instance_count => v} }
      }
      request_emr(params) { |aws_result| yield aws_result if block_given? }
    end

    # Start a job flow with the specified configuration.  This is a very thin
    # wrapper around the AWS API, so in order to use it directly you'll need
    # to have the PDF API reference handy, which can be found here:
    #
    # http://awsdocs.s3.amazonaws.com/ElasticMapReduce/20090331/emr-api-20090331.pdf
    #
    # Here is a sample job flow configuration that should help.  This job flow
    # starts by installing Pig then running a Pig script.  It is based off of the
    # Pig demo script from Amazon.
    #
    #   emr.run_job_flow({
    #     :name => "Elasticity Test Flow (EMR Pig Script)",
    #     :instances => {
    #       :ec2_key_name => "sharethrough-dev",
    #       :hadoop_version => "0.20",
    #       :instance_count => 2,
    #       :master_instance_type => "m1.small",
    #       :placement => {
    #         :availability_zone => "us-east-1a"
    #       },
    #       :slave_instance_type => "m1.small",
    #     },
    #     :steps => [
    #       {
    #         :action_on_failure => "TERMINATE_JOB_FLOW",
    #         :hadoop_jar_step => {
    #           :args => [
    #             "s3://elasticmapreduce/libs/pig/pig-script",
    #               "--base-path",
    #               "s3://elasticmapreduce/libs/pig/",
    #               "--install-pig"
    #           ],
    #           :jar => "s3://elasticmapreduce/libs/script-runner/script-runner.jar"
    #         },
    #         :name => "Setup Pig"
    #       },
    #         {
    #           :action_on_failure => "TERMINATE_JOB_FLOW",
    #           :hadoop_jar_step => {
    #             :args => [
    #               "s3://elasticmapreduce/libs/pig/pig-script",
    #                 "--run-pig-script",
    #                 "--args",
    #                 "-p",
    #                 "INPUT=s3n://elasticmapreduce/samples/pig-apache/input",
    #                 "-p",
    #                 "OUTPUT=s3n://slif-elasticity/pig-apache/output/2011-04-19",
    #                 "s3n://elasticmapreduce/samples/pig-apache/do-reports.pig"
    #             ],
    #             :jar => "s3://elasticmapreduce/libs/script-runner/script-runner.jar"
    #           },
    #           :name => "Run Pig Script"
    #         }
    #     ]
    #   })
    #
    # Yields the raw response if a block is given and returns the text of the
    # JobFlowId element.  Raises ArgumentError (with the AWS error message)
    # on a bad request.
    def run_job_flow(job_flow_config)
      params = {
        :operation => "RunJobFlow",
      }.merge(job_flow_config)
      request_emr(params) do |aws_result|
        yield aws_result if block_given?
        xml_doc = Nokogiri::XML(aws_result)
        xml_doc.remove_namespaces!
        xml_doc.xpath("/RunJobFlowResponse/RunJobFlowResult/JobFlowId").text
      end
    end

    # Enable or disable "termination protection" on the specified job flows.
    # Termination protection prevents a job flow from being terminated by a
    # user initiated action, although the job flow will still terminate
    # naturally.
    #
    # Takes an [] of job flow IDs.
    #
    #   ["j-1B4D1XP0C0A35", "j-1YG2MYL0HVYS5", ...]
    #
    # Yields the raw response if a block is given.  Raises ArgumentError
    # (with the AWS error message) on a bad request.
    def set_termination_protection(jobflow_ids, protection_enabled=true)
      params = {
        :operation => "SetTerminationProtection",
        :termination_protected => protection_enabled,
        :job_flow_ids => jobflow_ids
      }
      request_emr(params) { |aws_result| yield aws_result if block_given? }
    end

    # Terminate the specified jobflow.  Amazon does not define a return value
    # for this operation, so you'll need to poll #describe_jobflows to see
    # the state of the jobflow.  Raises ArgumentError if the specified job
    # flow does not exist.
    def terminate_jobflows(jobflow_id)
      params = {
        :operation => "TerminateJobFlows",
        :job_flow_ids => [jobflow_id]
      }
      aws_result = @aws_request.aws_emr_request(EMR.convert_ruby_to_aws(params))
      yield aws_result if block_given?
    rescue RestClient::BadRequest
      # Any bad request is reported as a missing job flow; the AWS error
      # document is deliberately not parsed here.
      raise ArgumentError, "Job flow '#{jobflow_id}' does not exist."
    end

    # Pass the specified params hash directly through to the AWS request URL.
    # Use this if you want to perform an operation that hasn't yet been wrapped
    # by Elasticity or you just want to see the response XML for yourself :)
    #
    # No Ruby-to-AWS name conversion and no error translation is applied.
    def direct(params)
      @aws_request.aws_emr_request(params)
    end

    # These helpers are invoked as EMR.<name> by the instance methods above,
    # so they are (and must remain) public singleton methods.
    class << self

      # AWS error responses all follow the same form.  Extract the message from
      # the error document.
      def parse_error_response(error_xml)
        xml_doc = Nokogiri::XML(error_xml)
        xml_doc.remove_namespaces!
        xml_doc.xpath("/ErrorResponse/Error/Message").text
      end

      # Since we use the same structure as AWS, we can generate AWS param names
      # from the Ruby versions of those names (and the param nesting).
      #
      #   {:job_flow_ids => ["j-1"], :instances => {:instance_count => 2}}
      #   # => {"JobFlowIds.member.1" => "j-1", "Instances.InstanceCount" => 2}
      #
      # Hashes (top-level or inside arrays) are flattened recursively; any
      # other array element (String, Integer, ...) is stored as-is under its
      # "Name.member.N" key.
      def convert_ruby_to_aws(params)
        result = {}
        params.each do |key, value|
          aws_key = camelize(key.to_s)
          case value
            when Array
              value.each_with_index do |item, index|
                member_key = "#{aws_key}.member.#{index + 1}"
                if item.is_a?(Hash)
                  convert_ruby_to_aws(item).each do |nested_key, nested_value|
                    result["#{member_key}.#{nested_key}"] = nested_value
                  end
                else
                  # Scalars of any type are leaf values; only hashes nest.
                  result[member_key] = item
                end
              end
            when Hash
              convert_ruby_to_aws(value).each do |nested_key, nested_value|
                result["#{aws_key}.#{nested_key}"] = nested_value
              end
            else
              result[aws_key] = value
          end
        end
        result
      end

      # (Used from Rails' ActiveSupport)
      #
      # Converts "job_flow_id" to "JobFlowId".  When first_letter_in_uppercase
      # is false the first character of the input is kept unchanged
      # ("job_flow_id" becomes "jobFlowId").
      def camelize(lower_case_and_underscored_word, first_letter_in_uppercase = true)
        word = lower_case_and_underscored_word.to_s
        camelized = word.gsub(/\/(.?)/) { "::" + $1.upcase }.gsub(/(^|_)(.)/) { $2.upcase }
        return camelized if first_letter_in_uppercase
        # String#first only exists with ActiveSupport loaded; slice instead.
        word[0, 1] + camelized[1..-1].to_s
      end

    end

    private

    # Converts +params+ to AWS names, performs the request and yields the raw
    # response to the block, returning the block's value.  A bad request
    # raised anywhere inside (including inside the block) is re-raised as an
    # ArgumentError carrying the message from the AWS error document.
    def request_emr(params)
      yield @aws_request.aws_emr_request(EMR.convert_ruby_to_aws(params))
    rescue RestClient::BadRequest => e
      raise ArgumentError, EMR.parse_error_response(e.http_body)
    end

  end

end
@@ -0,0 +1,71 @@
1
module Elasticity

  # HiveJob lets you quickly and easily kick off a Hive jobflow without
  # having to understand the entirety of the EMR API.
  class HiveJob < Elasticity::SimpleJob

    # Delegates both credentials to SimpleJob and sets the default job name.
    def initialize(aws_access_key_id, aws_secret_access_key)
      super
      @name = "Elasticity Hive Job"
    end

    # Run the specified Hive script with the specified variables.
    #
    #   hive = Elasticity::HiveJob.new("access", "secret")
    #   jobflow_id = hive.run('s3n://slif-hive/test.q', {
    #     'SCRIPTS' => 's3n://slif-test/scripts',
    #     'OUTPUT'  => 's3n://slif-test/output',
    #     'XREFS'   => 's3n://slif-test/xrefs'
    #   })
    #
    # The variables are accessible within your Hive scripts by using the
    # standard ${NAME} syntax.  E.g.
    #
    #   ADD JAR ${SCRIPTS}/jsonserde.jar;
    #
    # The jobflow has two steps: "Setup Hive" installs Hive, then
    # "Run Hive Script" executes the script with one "-d NAME=value" pair per
    # variable.  Returns whatever @emr.run_job_flow returns.
    # NOTE(review): @emr and the instance settings read below are presumably
    # initialised by SimpleJob - confirm against that class.
    def run(hive_script, hive_variables={})
      script_runner_jar = "s3://elasticmapreduce/libs/script-runner/script-runner.jar"

      run_script_args = [
        "s3://elasticmapreduce/libs/hive/hive-script",
        "--run-hive-script",
        "--args",
        "-f", hive_script
      ]
      hive_variables.each do |variable_name, value|
        run_script_args << "-d" << "#{variable_name}=#{value}"
      end

      install_hive_step = {
        :action_on_failure => "TERMINATE_JOB_FLOW",
        :hadoop_jar_step => {
          :jar => script_runner_jar,
          :args => [
            "s3://elasticmapreduce/libs/hive/hive-script",
            "--base-path", "s3://elasticmapreduce/libs/hive/",
            "--install-hive"
          ]
        },
        :name => "Setup Hive"
      }

      run_script_step = {
        :action_on_failure => @action_on_failure,
        :hadoop_jar_step => {
          :jar => script_runner_jar,
          :args => run_script_args
        },
        :name => "Run Hive Script"
      }

      instance_settings = {
        :ec2_key_name => @ec2_key_name,
        :hadoop_version => @hadoop_version,
        :instance_count => @instance_count,
        :master_instance_type => @master_instance_type,
        :slave_instance_type => @slave_instance_type
      }

      jobflow_config = {
        :name => @name,
        :instances => instance_settings,
        :steps => [install_hive_step, run_script_step]
      }
      # The log URI is optional; the key is only sent when one was configured.
      jobflow_config[:log_uri] = @log_uri if @log_uri

      @emr.run_job_flow(jobflow_config)
    end

  end

end