RubyGems - wakoopa-elasticity - Versions diffs - 1.2.3 - Mend

wakoopa-elasticity 1.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40)

data/.autotest +2 -0
data/.gitignore +5 -0
data/.rspec +2 -0
data/.rvmrc +1 -0
data/Gemfile +4 -0
data/HISTORY.mediawiki +30 -0
data/LICENSE +202 -0
data/README.mediawiki +332 -0
data/Rakefile +11 -0
data/elasticity.gemspec +29 -0
data/lib/elasticity.rb +16 -0
data/lib/elasticity/aws_request.rb +52 -0
data/lib/elasticity/emr.rb +282 -0
data/lib/elasticity/hive_job.rb +71 -0
data/lib/elasticity/job_flow.rb +53 -0
data/lib/elasticity/job_flow_step.rb +36 -0
data/lib/elasticity/pig_job.rb +112 -0
data/lib/elasticity/simple_job.rb +50 -0
data/lib/elasticity/version.rb +3 -0
data/spec/fixtures/vcr_cassettes/add_instance_groups/one_group_successful.yml +38 -0
data/spec/fixtures/vcr_cassettes/add_instance_groups/one_group_unsuccessful.yml +35 -0
data/spec/fixtures/vcr_cassettes/add_jobflow_steps/add_multiple_steps.yml +252 -0
data/spec/fixtures/vcr_cassettes/describe_jobflows/all_jobflows.yml +69 -0
data/spec/fixtures/vcr_cassettes/direct/terminate_jobflow.yml +32 -0
data/spec/fixtures/vcr_cassettes/hive_job/hive_ads.yml +35 -0
data/spec/fixtures/vcr_cassettes/modify_instance_groups/set_instances_to_3.yml +32 -0
data/spec/fixtures/vcr_cassettes/pig_job/apache_log_reports.yml +35 -0
data/spec/fixtures/vcr_cassettes/pig_job/apache_log_reports_with_bootstrap.yml +35 -0
data/spec/fixtures/vcr_cassettes/run_jobflow/word_count.yml +35 -0
data/spec/fixtures/vcr_cassettes/set_termination_protection/nonexistent_job_flows.yml +35 -0
data/spec/fixtures/vcr_cassettes/set_termination_protection/protect_multiple_job_flows.yml +32 -0
data/spec/fixtures/vcr_cassettes/terminate_jobflows/one_jobflow.yml +32 -0
data/spec/lib/elasticity/aws_request_spec.rb +62 -0
data/spec/lib/elasticity/emr_spec.rb +794 -0
data/spec/lib/elasticity/hive_job_spec.rb +96 -0
data/spec/lib/elasticity/job_flow_spec.rb +139 -0
data/spec/lib/elasticity/job_flow_step_spec.rb +76 -0
data/spec/lib/elasticity/pig_job_spec.rb +211 -0
data/spec/spec_helper.rb +43 -0
metadata +253 -0

data/Rakefile ADDED

@@ -0,0 +1,11 @@
# Rake tasks for the gem: whatever Bundler::GemHelper.install_tasks registers,
# plus an RSpec task that a bare `rake` runs through the :default task.
require 'bundler'
require 'rake/testtask'
require 'rspec/core/rake_task'

Bundler::GemHelper.install_tasks

# `rake` with no arguments depends on (and therefore runs) :spec.
desc 'Run specs'
task :default => :spec

# :spec is the name RSpec::Core::RakeTask uses when none is given; it is
# spelled out here so the :default dependency above is easy to follow.
desc "Run specs"
RSpec::Core::RakeTask.new(:spec)

data/elasticity.gemspec ADDED

@@ -0,0 +1,29 @@
# -*- encoding: utf-8 -*-
# Gem specification for wakoopa-elasticity.  The version number is read from
# lib/elasticity/version.rb, so lib/ is put on the load path first.
lib_dir = File.expand_path("../lib", __FILE__)
$LOAD_PATH.push(lib_dir)
require "elasticity/version"

Gem::Specification.new do |s|
  s.name        = "wakoopa-elasticity"
  s.version     = Elasticity::VERSION
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Robert Slifka"]
  s.homepage    = "http://www.github.com/rslifka/elasticity"
  s.summary     = "Programmatic access to Amazon's Elastic Map Reduce service."
  s.description = "Programmatic access to Amazon's Elastic Map Reduce service."

  # Runtime dependencies.
  ["rest-client", "nokogiri"].each do |runtime_gem|
    s.add_dependency(runtime_gem)
  end

  # Development-only dependencies, as [name, optional version requirement],
  # registered in the order listed.
  [
    ["autotest-fsevent"],
    ["autotest-growl"],
    ["rake"],
    ["rspec",   ">= 2.5.0"],
    ["vcr",     ">= 1.5.1"],
    ["webmock", ">= 1.6.2"],
    ["ZenTest"]
  ].each do |name_and_requirement|
    s.add_development_dependency(*name_and_requirement)
  end

  # The packaged file lists come from git, so building the gem requires a git
  # checkout with the git executable available.
  tracked_files     = `git ls-files`.split("\n")
  tracked_tests     = `git ls-files -- {test,spec,features}/*`.split("\n")
  tracked_bin_files = `git ls-files -- bin/*`.split("\n")

  s.files         = tracked_files
  s.test_files    = tracked_tests
  s.executables   = tracked_bin_files.map { |path| File.basename(path) }
  s.require_paths = ["lib"]
end

data/lib/elasticity.rb ADDED

@@ -0,0 +1,16 @@
# Entry point for the gem: loads the external libraries first, then each
# Elasticity component, and finally makes sure the namespace module exists.
%w[base64 rest_client nokogiri].each { |library| require library }

# Order is significant for at least one pair: simple_job is loaded before
# hive_job because HiveJob subclasses Elasticity::SimpleJob.
%w[aws_request emr job_flow job_flow_step simple_job hive_job pig_job].each do |component|
  require "elasticity/#{component}"
end

module Elasticity
end

data/lib/elasticity/aws_request.rb ADDED

@@ -0,0 +1,52 @@
# This file calls Base64 and OpenSSL directly, so it requires them itself
# instead of relying on another library having loaded them already.
require 'base64'
require 'openssl'

module Elasticity
  # Builds, signs (AWS Signature Version 2 with HmacSHA256) and issues GET
  # requests against the Elastic MapReduce query endpoint.
  class AwsRequest

    # aws_access_key_id     - sent with every request as AWSAccessKeyId.
    # aws_secret_access_key - HMAC key used to sign requests.
    # options               - :secure (default true) selects https over http;
    #                         :region, when present, selects the regional
    #                         host "elasticmapreduce.<region>.amazonaws.com".
    def initialize(aws_access_key_id, aws_secret_access_key, options = {})
      @access_key = aws_access_key_id
      @secret_key = aws_secret_access_key
      @options = {:secure => true}.merge(options)
    end

    # Signs +params+ (a hash of AWS query parameters) and performs the GET.
    # Returns whatever RestClient.get returns; RestClient errors propagate
    # to the caller untouched.
    def aws_emr_request(params)
      host = @options[:region] ? "elasticmapreduce.#{@options[:region]}.amazonaws.com" : "elasticmapreduce.amazonaws.com"
      protocol = @options[:secure] ? "https" : "http"
      signed_params = sign_params(params, "GET", host, "/")
      signed_request = "#{protocol}://#{host}?#{signed_params}"
      RestClient.get signed_request
    end

    # (Used from RightScale's right_aws gem.)
    # EC2, SQS, SDB and EMR requests must be signed by this guy.
    # See: http://docs.amazonwebservices.com/AmazonSimpleDB/2007-11-07/DeveloperGuide/index.html?REST_RESTAuth.html
    #      http://developer.amazonwebservices.com/connect/entry.jspa?externalID=1928
    #
    # service_hash - request parameters.  It is NOT modified: the signing
    #                fields are merged into a copy, so a caller's hash can be
    #                reused and never picks up the access key.
    # http_verb    - HTTP method; upcased in the string to sign.
    # host         - endpoint host; downcased in the string to sign.
    # uri          - request path, used as given.
    #
    # Returns the escaped query string, keys sorted, with "&Signature=..."
    # appended.
    def sign_params(service_hash, http_verb, host, uri)
      params = service_hash.merge(
        "AWSAccessKeyId"   => @access_key,
        "Timestamp"        => Time.now.utc.strftime("%Y-%m-%dT%H:%M:%S.000Z"),
        "SignatureVersion" => "2",
        "SignatureMethod"  => "HmacSHA256"
      )
      canonical_string = params.keys.sort.map do |key|
        "#{AwsRequest.aws_escape(key)}=#{AwsRequest.aws_escape(params[key])}"
      end.join('&')
      string_to_sign = "#{http_verb.to_s.upcase}\n#{host.downcase}\n#{uri}\n#{canonical_string}"
      digest = OpenSSL::HMAC.digest("sha256", @secret_key, string_to_sign)
      signature = AwsRequest.aws_escape(Base64.encode64(digest).strip)
      "#{canonical_string}&Signature=#{signature}"
    end

    class << self

      # (Used from RightScale's right_aws gem)
      # Escape a string according to Amazon's rules.
      # See: http://docs.amazonwebservices.com/AmazonSimpleDB/2007-11-07/DeveloperGuide/index.html?REST_RESTAuth.html
      #
      # Every byte outside [a-zA-Z0-9._~-] becomes %XX (uppercase hex).  The
      # unpack template is sized with bytesize, not size, so each byte of a
      # multibyte character is escaped ("é" in UTF-8 => "%C3%A9").
      def aws_escape(param)
        param.to_s.gsub(/[^a-zA-Z0-9._~-]+/) do |unsafe_run|
          '%' + unsafe_run.unpack('H2' * unsafe_run.bytesize).join('%').upcase
        end
      end

    end
  end
end

data/lib/elasticity/emr.rb ADDED

@@ -0,0 +1,282 @@
module Elasticity

  # Thin Ruby wrapper around the Amazon Elastic MapReduce query API.  Each
  # public method builds a Ruby-style params hash, converts it to AWS
  # parameter names (see EMR.convert_ruby_to_aws) and sends it through
  # Elasticity::AwsRequest.  Where noted, a RestClient::BadRequest from AWS
  # is re-raised as ArgumentError.  Methods that accept a block yield the
  # raw AWS response to it.
  class EMR

    # Credentials and options are handed straight to Elasticity::AwsRequest.
    def initialize(aws_access_key_id, aws_secret_access_key, options = {})
      @aws_request = Elasticity::AwsRequest.new(aws_access_key_id, aws_secret_access_key, options)
    end

    # Lists all jobflows in all states.  +params+ is merged into the request
    # (the :operation key is always set to "DescribeJobFlows").  Returns the
    # result of JobFlow.from_members_nodeset for the <member> nodes of the
    # response.  Unlike the other operations, RestClient errors are not
    # translated here.
    def describe_jobflows(params = {})
      aws_result = @aws_request.aws_emr_request(EMR.convert_ruby_to_aws(
        params.merge({:operation => "DescribeJobFlows"}))
      )
      xml_doc = Nokogiri::XML(aws_result)
      xml_doc.remove_namespaces!
      yield aws_result if block_given?
      JobFlow.from_members_nodeset(xml_doc.xpath("/DescribeJobFlowsResponse/DescribeJobFlowsResult/JobFlows/member"))
    end

    # Adds a new group of instances to the specified jobflow.  Elasticity maps a
    # more Ruby-like syntax to the Amazon options.  An exhaustive hash follows although
    # not all of these options are required (or valid!) at once.  Please see the
    # EMR docs for details although even then you're going to need to experiment :)
    #
    #   instance_group_config = {
    #     :bid_price => 5,
    #     :instance_count => 1,
    #     :instance_role => "TASK",
    #     :market => "SPOT",
    #     :name => "Go Canucks Go!",
    #     :type => "m1.small"
    #   }
    #
    # add_instance_groups takes an array of {}.  Returns an array of the instance IDs
    # that were created by the specified configs.
    #
    #   ["ig-2GOVEN6HVJZID", "ig-1DU9M2UQMM051", "ig-3DZRW4Y2X4S", ...]
    #
    # Raises ArgumentError (with the AWS error message) on a 400 response.
    def add_instance_groups(jobflow_id, instance_group_configs)
      params = {
        :operation => "AddInstanceGroups",
        :job_flow_id => jobflow_id,
        :instance_groups => instance_group_configs
      }
      aws_result = @aws_request.aws_emr_request(EMR.convert_ruby_to_aws(params))
      xml_doc = Nokogiri::XML(aws_result)
      xml_doc.remove_namespaces!
      id_nodes = xml_doc.xpath("/AddInstanceGroupsResponse/AddInstanceGroupsResult/InstanceGroupIds/member")
      instance_group_ids = id_nodes.map { |member| member.text }
      yield aws_result if block_given?
      instance_group_ids
    rescue RestClient::BadRequest => e
      raise ArgumentError, EMR.parse_error_response(e.http_body)
    end

    # Add a step (or steps) to the specified job flow.
    #
    #   emr.add_jobflow_steps("j-123", {
    #     :steps => [
    #       {
    #         :action_on_failure => "TERMINATE_JOB_FLOW",
    #         :hadoop_jar_step => {
    #           :args => [
    #             "s3://elasticmapreduce/libs/pig/pig-script",
    #               "--base-path",
    #               "s3://elasticmapreduce/libs/pig/",
    #               "--install-pig"
    #           ],
    #           :jar => "s3://elasticmapreduce/libs/script-runner/script-runner.jar"
    #         },
    #         :name => "Setup Pig"
    #       }
    #     ]
    #   })
    #
    # Raises ArgumentError (with the AWS error message) on a 400 response.
    def add_jobflow_steps(jobflow_id, steps_config)
      # merge! mutates the freshly built literal, never the caller's hash.
      params = {
        :operation => "AddJobFlowSteps",
        :job_flow_id => jobflow_id
      }.merge!(steps_config)
      aws_result = @aws_request.aws_emr_request(EMR.convert_ruby_to_aws(params))
      yield aws_result if block_given?
    rescue RestClient::BadRequest => e
      raise ArgumentError, EMR.parse_error_response(e.http_body)
    end

    # Set the number of instances in the specified instance groups to the
    # specified counts.  Note that this modifies the *request* count, which
    # is not the same as the *running* count.  I.e. you request instances
    # and then wait for them to be created.
    #
    # Takes a {} of instance group IDs => desired instance count.
    #
    #   {"ig-1" => 40, "ig-2" => 5, ...}
    #
    # Raises ArgumentError (with the AWS error message) on a 400 response.
    def modify_instance_groups(instance_group_config)
      params = {
        :operation => "ModifyInstanceGroups",
        :instance_groups => instance_group_config.map { |k, v| {:instance_group_id => k, :instance_count => v} }
      }
      aws_result = @aws_request.aws_emr_request(EMR.convert_ruby_to_aws(params))
      yield aws_result if block_given?
    rescue RestClient::BadRequest => e
      raise ArgumentError, EMR.parse_error_response(e.http_body)
    end

    # Start a job flow with the specified configuration.  This is a very thin
    # wrapper around the AWS API, so in order to use it directly you'll need
    # to have the PDF API reference handy, which can be found here:
    #
    # http://awsdocs.s3.amazonaws.com/ElasticMapReduce/20090331/emr-api-20090331.pdf
    #
    # Here is a sample job flow configuration that should help.  This job flow
    # starts by installing Pig then running a Pig script.  It is based off of the
    # Pig demo script from Amazon.
    #
    #   emr.run_job_flow({
    #     :name => "Elasticity Test Flow (EMR Pig Script)",
    #     :instances => {
    #       :ec2_key_name => "sharethrough-dev",
    #       :hadoop_version => "0.20",
    #       :instance_count => 2,
    #       :master_instance_type => "m1.small",
    #       :placement => {
    #         :availability_zone => "us-east-1a"
    #       },
    #       :slave_instance_type => "m1.small",
    #     },
    #     :steps => [
    #       {
    #         :action_on_failure => "TERMINATE_JOB_FLOW",
    #         :hadoop_jar_step => {
    #           :args => [
    #             "s3://elasticmapreduce/libs/pig/pig-script",
    #               "--base-path",
    #               "s3://elasticmapreduce/libs/pig/",
    #               "--install-pig"
    #           ],
    #           :jar => "s3://elasticmapreduce/libs/script-runner/script-runner.jar"
    #         },
    #         :name => "Setup Pig"
    #       },
    #       {
    #         :action_on_failure => "TERMINATE_JOB_FLOW",
    #         :hadoop_jar_step => {
    #           :args => [
    #             "s3://elasticmapreduce/libs/pig/pig-script",
    #               "--run-pig-script",
    #               "--args",
    #               "-p",
    #               "INPUT=s3n://elasticmapreduce/samples/pig-apache/input",
    #               "-p",
    #               "OUTPUT=s3n://slif-elasticity/pig-apache/output/2011-04-19",
    #               "s3n://elasticmapreduce/samples/pig-apache/do-reports.pig"
    #           ],
    #           :jar => "s3://elasticmapreduce/libs/script-runner/script-runner.jar"
    #         },
    #         :name => "Run Pig Script"
    #       }
    #     ]
    #   })
    #
    # Returns the text of the JobFlowId element of the response.  Raises
    # ArgumentError (with the AWS error message) on a 400 response.
    def run_job_flow(job_flow_config)
      params = {
        :operation => "RunJobFlow",
      }.merge!(job_flow_config)
      aws_result = @aws_request.aws_emr_request(EMR.convert_ruby_to_aws(params))
      yield aws_result if block_given?
      xml_doc = Nokogiri::XML(aws_result)
      xml_doc.remove_namespaces!
      xml_doc.xpath("/RunJobFlowResponse/RunJobFlowResult/JobFlowId").text
    rescue RestClient::BadRequest => e
      raise ArgumentError, EMR.parse_error_response(e.http_body)
    end

    # Enable or disable "termination protection" on the specified job flows.
    # Termination protection prevents a job flow from being terminated by a
    # user initiated action, although the job flow will still terminate
    # naturally.
    #
    # Takes an [] of job flow IDs.
    #
    #   ["j-1B4D1XP0C0A35", "j-1YG2MYL0HVYS5", ...]
    #
    # Raises ArgumentError (with the AWS error message) on a 400 response.
    def set_termination_protection(jobflow_ids, protection_enabled=true)
      params = {
        :operation => "SetTerminationProtection",
        :termination_protected => protection_enabled,
        :job_flow_ids => jobflow_ids
      }
      aws_result = @aws_request.aws_emr_request(EMR.convert_ruby_to_aws(params))
      yield aws_result if block_given?
    rescue RestClient::BadRequest => e
      raise ArgumentError, EMR.parse_error_response(e.http_body)
    end

    # Terminate the specified jobflow.  Amazon does not define a return value
    # for this operation, so you'll need to poll #describe_jobflows to see
    # the state of the jobflow.  Raises ArgumentError if the specified job
    # flow does not exist.
    #
    # Note: despite the plural name this takes a single job flow ID, and any
    # 400 response is reported as "does not exist".
    def terminate_jobflows(jobflow_id)
      params = {
        :operation => "TerminateJobFlows",
        :job_flow_ids => [jobflow_id]
      }
      aws_result = @aws_request.aws_emr_request(EMR.convert_ruby_to_aws(params))
      yield aws_result if block_given?
    rescue RestClient::BadRequest
      raise ArgumentError, "Job flow '#{jobflow_id}' does not exist."
    end

    # Pass the specified params hash directly through to the AWS request URL.
    # Use this if you want to perform an operation that hasn't yet been wrapped
    # by Elasticity or you just want to see the response XML for yourself :)
    #
    # No key conversion is applied, so +params+ must already use AWS names.
    def direct(params)
      @aws_request.aws_emr_request(params)
    end

    # The helpers below are deliberately public: the instance methods above
    # call them with an explicit receiver (EMR.convert_ruby_to_aws, ...).  The
    # bare `private` that used to precede this section had no effect on
    # methods defined inside `class << self`, so it has been removed.
    class << self

      # AWS error responses all follow the same form.  Extract the message from
      # the error document.
      def parse_error_response(error_xml)
        xml_doc = Nokogiri::XML(error_xml)
        xml_doc.remove_namespaces!
        xml_doc.xpath("/ErrorResponse/Error/Message").text
      end

      # Since we use the same structure as AWS, we can generate AWS param names
      # from the Ruby versions of those names (and the param nesting).
      #
      #   {:job_flow_ids => ["j-1"], :instances => {:instance_count => 2}}
      #   # => {"JobFlowIds.member.1" => "j-1", "Instances.InstanceCount" => 2}
      #
      # Arrays become 1-based "Name.member.N" entries and hashes become dotted
      # prefixes.  Only Hash items inside an array are expanded recursively;
      # any other item (String, Integer, Symbol, ...) is emitted as a value
      # instead of raising NoMethodError.
      def convert_ruby_to_aws(params)
        result = {}
        params.each do |key, value|
          aws_key = camelize(key.to_s)
          case value
            when Array
              value.each_with_index do |item, index|
                member_key = "#{aws_key}.member.#{index + 1}"
                if item.is_a?(Hash)
                  convert_ruby_to_aws(item).each do |nested_key, nested_value|
                    result["#{member_key}.#{nested_key}"] = nested_value
                  end
                else
                  result[member_key] = item
                end
              end
            when Hash
              convert_ruby_to_aws(value).each do |nested_key, nested_value|
                result["#{aws_key}.#{nested_key}"] = nested_value
              end
            else
              result[aws_key] = value
          end
        end
        result
      end

      # (Used from Rails' ActiveSupport)
      # "job_flow_id" => "JobFlowId", or "jobFlowId" when
      # first_letter_in_uppercase is false.  The lower-camel branch takes the
      # first character with String#[] because String#first exists only when
      # ActiveSupport is loaded; an empty word is returned as-is.
      def camelize(lower_case_and_underscored_word, first_letter_in_uppercase = true)
        word = lower_case_and_underscored_word.to_s
        camelized = word.gsub(/\/(.?)/) { "::" + $1.upcase }.gsub(/(^|_)(.)/) { $2.upcase }
        return camelized if first_letter_in_uppercase || word.empty?
        word[0, 1] + camelized[1..-1]
      end

    end
  end
end

data/lib/elasticity/hive_job.rb ADDED

@@ -0,0 +1,71 @@
module Elasticity

  # HiveJob lets you quickly and easily kick off a Hive jobflow without
  # having to understand the entirety of the EMR API.
  class HiveJob < Elasticity::SimpleJob

    # Forwards both credentials to SimpleJob#initialize unchanged (bare
    # super), then sets the job name.
    def initialize(aws_access_key_id, aws_secret_access_key)
      super
      @name = "Elasticity Hive Job"
    end

    # Run the specified Hive script with the specified variables.
    #
    #   hive = Elasticity::HiveJob.new("access", "secret")
    #   jobflow_id = hive.run('s3n://slif-hive/test.q', {
    #     'SCRIPTS' => 's3n://slif-test/scripts',
    #     'OUTPUT'  => 's3n://slif-test/output',
    #     'XREFS'   => 's3n://slif-test/xrefs'
    #   })
    #
    # The variables are accessible within your Hive scripts by using the
    # standard ${NAME} syntax.  E.g.
    #
    #   ADD JAR ${SCRIPTS}/jsonserde.jar;
    #
    # The job flow has two steps: "Setup Hive" installs Hive, then
    # "Run Hive Script" runs +hive_script+ with one "-d NAME=value" pair per
    # variable.  Returns whatever @emr.run_job_flow returns.
    #
    # NOTE(review): @emr and the instance settings read below (@ec2_key_name,
    # @hadoop_version, @instance_count, @master_instance_type,
    # @slave_instance_type, @action_on_failure, @log_uri) are presumably set
    # up by SimpleJob - confirm against simple_job.rb.
    def run(hive_script, hive_variables={})
      variable_flags = hive_variables.inject([]) do |flags, (variable_name, value)|
        flags + ["-d", "#{variable_name}=#{value}"]
      end
      script_arguments = [
        "s3://elasticmapreduce/libs/hive/hive-script", "--run-hive-script", "--args",
        "-f", hive_script
      ] + variable_flags

      # Step 1: install Hive.  Always terminates the job flow on failure.
      setup_step = {
        :action_on_failure => "TERMINATE_JOB_FLOW",
        :hadoop_jar_step => {
          :jar => "s3://elasticmapreduce/libs/script-runner/script-runner.jar",
          :args => [
            "s3://elasticmapreduce/libs/hive/hive-script",
            "--base-path", "s3://elasticmapreduce/libs/hive/",
            "--install-hive"
          ]
        },
        :name => "Setup Hive"
      }

      # Step 2: run the user's script; the failure policy is configurable.
      run_step = {
        :action_on_failure => @action_on_failure,
        :hadoop_jar_step => {
          :jar => "s3://elasticmapreduce/libs/script-runner/script-runner.jar",
          :args => script_arguments
        },
        :name => "Run Hive Script"
      }

      instances = {
        :ec2_key_name => @ec2_key_name,
        :hadoop_version => @hadoop_version,
        :instance_count => @instance_count,
        :master_instance_type => @master_instance_type,
        :slave_instance_type => @slave_instance_type
      }

      jobflow_config = {
        :name => @name,
        :instances => instances,
        :steps => [setup_step, run_step]
      }
      # The log URI is only sent when one has been configured.
      jobflow_config[:log_uri] = @log_uri if @log_uri

      @emr.run_job_flow(jobflow_config)
    end

  end
end