wakoopa-elasticity 1.2.3
Sign up to get free protection for your applications and to get access to all the features.
- data/.autotest +2 -0
- data/.gitignore +5 -0
- data/.rspec +2 -0
- data/.rvmrc +1 -0
- data/Gemfile +4 -0
- data/HISTORY.mediawiki +30 -0
- data/LICENSE +202 -0
- data/README.mediawiki +332 -0
- data/Rakefile +11 -0
- data/elasticity.gemspec +29 -0
- data/lib/elasticity.rb +16 -0
- data/lib/elasticity/aws_request.rb +52 -0
- data/lib/elasticity/emr.rb +282 -0
- data/lib/elasticity/hive_job.rb +71 -0
- data/lib/elasticity/job_flow.rb +53 -0
- data/lib/elasticity/job_flow_step.rb +36 -0
- data/lib/elasticity/pig_job.rb +112 -0
- data/lib/elasticity/simple_job.rb +50 -0
- data/lib/elasticity/version.rb +3 -0
- data/spec/fixtures/vcr_cassettes/add_instance_groups/one_group_successful.yml +38 -0
- data/spec/fixtures/vcr_cassettes/add_instance_groups/one_group_unsuccessful.yml +35 -0
- data/spec/fixtures/vcr_cassettes/add_jobflow_steps/add_multiple_steps.yml +252 -0
- data/spec/fixtures/vcr_cassettes/describe_jobflows/all_jobflows.yml +69 -0
- data/spec/fixtures/vcr_cassettes/direct/terminate_jobflow.yml +32 -0
- data/spec/fixtures/vcr_cassettes/hive_job/hive_ads.yml +35 -0
- data/spec/fixtures/vcr_cassettes/modify_instance_groups/set_instances_to_3.yml +32 -0
- data/spec/fixtures/vcr_cassettes/pig_job/apache_log_reports.yml +35 -0
- data/spec/fixtures/vcr_cassettes/pig_job/apache_log_reports_with_bootstrap.yml +35 -0
- data/spec/fixtures/vcr_cassettes/run_jobflow/word_count.yml +35 -0
- data/spec/fixtures/vcr_cassettes/set_termination_protection/nonexistent_job_flows.yml +35 -0
- data/spec/fixtures/vcr_cassettes/set_termination_protection/protect_multiple_job_flows.yml +32 -0
- data/spec/fixtures/vcr_cassettes/terminate_jobflows/one_jobflow.yml +32 -0
- data/spec/lib/elasticity/aws_request_spec.rb +62 -0
- data/spec/lib/elasticity/emr_spec.rb +794 -0
- data/spec/lib/elasticity/hive_job_spec.rb +96 -0
- data/spec/lib/elasticity/job_flow_spec.rb +139 -0
- data/spec/lib/elasticity/job_flow_step_spec.rb +76 -0
- data/spec/lib/elasticity/pig_job_spec.rb +211 -0
- data/spec/spec_helper.rb +43 -0
- metadata +253 -0
require 'spec_helper'

# Specs for Elasticity::HiveJob — a convenience wrapper that launches an EMR
# job flow with a Hive-install step followed by a run-hive-script step.
describe Elasticity::HiveJob do

  describe ".new" do

    it "should have good defaults" do
      hive = Elasticity::HiveJob.new("access", "secret")
      hive.aws_access_key_id.should == "access"
      hive.aws_secret_access_key.should == "secret"
      hive.ec2_key_name.should == "default"
      hive.hadoop_version.should == "0.20"
      hive.instance_count.should == 2
      hive.master_instance_type.should == "m1.small"
      hive.name.should == "Elasticity Hive Job"
      hive.slave_instance_type.should == "m1.small"
      hive.action_on_failure.should == "TERMINATE_JOB_FLOW"
      hive.log_uri.should == nil
    end

  end

  describe "#run" do

    it "should run the script with the specified variables and return the jobflow_id" do
      # Stub out EMR so we can assert on the exact run_job_flow payload
      # without touching AWS.
      aws = Elasticity::EMR.new("", "")
      aws.should_receive(:run_job_flow).with({
        :name => "Elasticity Hive Job",
        :log_uri => "s3n://slif-test/output/logs",
        :instances => {
          :ec2_key_name => "default",
          :hadoop_version => "0.20",
          :instance_count => 2,
          :master_instance_type => "m1.small",
          :slave_instance_type => "m1.small",
        },
        :steps => [
          # Step 1: install Hive onto the cluster via Amazon's script-runner.
          {
            :action_on_failure => "TERMINATE_JOB_FLOW",
            :hadoop_jar_step => {
              :jar => "s3://elasticmapreduce/libs/script-runner/script-runner.jar",
              :args => [
                "s3://elasticmapreduce/libs/hive/hive-script",
                "--base-path",
                "s3://elasticmapreduce/libs/hive/",
                "--install-hive"
              ],
            },
            :name => "Setup Hive"
          },
          # Step 2: run the user's script; -d defines the Hive variables
          # passed into HiveJob#run below.
          {
            :action_on_failure => "CONTINUE",
            :hadoop_jar_step => {
              :jar => "s3://elasticmapreduce/libs/script-runner/script-runner.jar",
              :args => [
                "s3://elasticmapreduce/libs/hive/hive-script",
                "--run-hive-script",
                "--args",
                "-f", "s3n://slif-hive/test.q",
                "-d", "XREFS=s3n://slif-test/xrefs",
                "-d", "OUTPUT=s3n://slif-test/output"
              ],
            },
            :name => "Run Hive Script"
          }
        ]
      }).and_return("new_jobflow_id")
      Elasticity::EMR.should_receive(:new).with("access", "secret").and_return(aws)

      hive = Elasticity::HiveJob.new("access", "secret")
      hive.log_uri = "s3n://slif-test/output/logs"
      hive.action_on_failure = "CONTINUE"
      jobflow_id = hive.run('s3n://slif-hive/test.q', {
        'OUTPUT' => 's3n://slif-test/output',
        'XREFS' => 's3n://slif-test/xrefs'
      })
      jobflow_id.should == "new_jobflow_id"
    end

  end

  # Replays a recorded AWS conversation (VCR cassette) — no live calls are made.
  describe "integration happy path" do
    use_vcr_cassette "hive_job/hive_ads", :record => :none
    it "should kick off the sample Amazon EMR Hive application" do
      hive = Elasticity::HiveJob.new(AWS_ACCESS_KEY_ID, AWS_SECRET_KEY)
      hive.ec2_key_name = "sharethrough_dev"
      jobflow_id = hive.run("s3n://elasticmapreduce/samples/hive-ads/libs/model-build.q", {
        "LIBS" => "s3n://elasticmapreduce/samples/hive-ads/libs",
        "INPUT" => "s3n://elasticmapreduce/samples/hive-ads/tables",
        "OUTPUT" => "s3n://slif-elasticity/hive-ads/output/2011-04-19"
      })
      jobflow_id.should == "j-1UUVYMHBLKEGN"
    end
  end

end
require 'spec_helper'

# Specs for Elasticity::JobFlow — parses EMR DescribeJobFlows XML into
# JobFlow value objects.
describe Elasticity::JobFlow do

  before do
    # Canned DescribeJobFlows response with two job flows: one ("j-p") that
    # has steps, one ("j-h") that does not.
    describe_jobflows_xml = <<-JOBFLOWS
      <DescribeJobFlowsResponse xmlns="http://elasticmapreduce.amazonaws.com/doc/2009-03-31">
        <DescribeJobFlowsResult>
          <JobFlows>
            <member>
              <JobFlowId>j-p</JobFlowId>
              <Name>Pig Job</Name>
              <ExecutionStatusDetail>
                <CreationDateTime>
                  2011-10-04T21:49:16Z
                </CreationDateTime>
                <StartDateTime>
                  2011-10-04T21:49:17Z
                </StartDateTime>
                <ReadyDateTime>
                  2011-10-04T21:49:18Z
                </ReadyDateTime>
                <State>TERMINATED</State>
              </ExecutionStatusDetail>
              <Steps>
                <member>
                  <StepConfig>
                    <Name>Setup Hive</Name>
                  </StepConfig>
                  <ExecutionStatusDetail>
                    <State>FAILED</State>
                  </ExecutionStatusDetail>
                </member>
                <member>
                  <StepConfig>
                    <Name>Run Hive Script</Name>
                  </StepConfig>
                  <ExecutionStatusDetail>
                    <State>PENDING</State>
                  </ExecutionStatusDetail>
                </member>
              </Steps>
              <Instances>
                <Placement>
                  <AvailabilityZone>
                    eu-west-1a
                  </AvailabilityZone>
                </Placement>
                <SlaveInstanceType>
                  m1.small
                </SlaveInstanceType>
                <MasterInstanceType>
                  m1.small
                </MasterInstanceType>
                <Ec2KeyName>
                  myec2keyname
                </Ec2KeyName>
                <InstanceCount>
                  4
                </InstanceCount>
              </Instances>
            </member>
            <member>
              <JobFlowId>j-h</JobFlowId>
              <Name>Hive Job</Name>
              <ExecutionStatusDetail>
                <CreationDateTime>
                  2011-10-04T22:49:16Z
                </CreationDateTime>
                <StartDateTime>
                  2011-10-04T22:49:17Z
                </StartDateTime>
                <ReadyDateTime>
                  2011-10-04T22:49:18Z
                </ReadyDateTime>
                <State>TERMINATED</State>
              </ExecutionStatusDetail>
              <Instances>
                <Placement>
                  <AvailabilityZone>
                    eu-west-1b
                  </AvailabilityZone>
                </Placement>
                <SlaveInstanceType>
                  c1.medium
                </SlaveInstanceType>
                <MasterInstanceType>
                  c1.medium
                </MasterInstanceType>
                <Ec2KeyName>
                  myec2keyname
                </Ec2KeyName>
                <InstanceCount>
                  2
                </InstanceCount>
              </Instances>
            </member>
          </JobFlows>
        </DescribeJobFlowsResult>
      </DescribeJobFlowsResponse>
    JOBFLOWS
    describe_jobflows_document = Nokogiri::XML(describe_jobflows_xml)
    # Namespaces are stripped so the xpath below can address elements directly.
    describe_jobflows_document.remove_namespaces!
    @members_nodeset = describe_jobflows_document.xpath('/DescribeJobFlowsResponse/DescribeJobFlowsResult/JobFlows/member')
  end

  describe ".from_member_element" do
    it "should return a JobFlow with the appropriate fields initialized" do
      jobflow = Elasticity::JobFlow.from_member_element(@members_nodeset[0])
      jobflow.name.should == "Pig Job"
      jobflow.jobflow_id.should == "j-p"
      jobflow.state.should == "TERMINATED"
      jobflow.steps.map(&:name).should == ["Setup Hive", "Run Hive Script"]
      jobflow.steps.map(&:state).should == ["FAILED", "PENDING"]
      jobflow.created_at.should == "2011-10-04T21:49:16Z"
      jobflow.started_at.should == "2011-10-04T21:49:17Z"
      jobflow.ready_at.should == "2011-10-04T21:49:18Z"
      jobflow.master_instance_type.should == "m1.small"
      jobflow.slave_instance_type.should == "m1.small"
      # InstanceCount is parsed straight from the XML, hence a String.
      jobflow.instance_count.should == "4"
    end
  end

  describe ".from_members_nodeset" do
    it "should return JobFlows with the appropriate fields initialized" do
      jobflow = Elasticity::JobFlow.from_members_nodeset(@members_nodeset)
      jobflow.map(&:name).should == ["Pig Job", "Hive Job"]
      jobflow.map(&:jobflow_id).should == ["j-p", "j-h"]
      jobflow.map(&:state).should == ["TERMINATED", "TERMINATED"]
      jobflow.map(&:created_at).should == ["2011-10-04T21:49:16Z", "2011-10-04T22:49:16Z"]
      jobflow.map(&:started_at).should == ["2011-10-04T21:49:17Z", "2011-10-04T22:49:17Z"]
      jobflow.map(&:ready_at).should == ["2011-10-04T21:49:18Z", "2011-10-04T22:49:18Z"]
      jobflow.map(&:master_instance_type).should == ["m1.small", "c1.medium"]
      jobflow.map(&:slave_instance_type).should == ["m1.small", "c1.medium"]
      jobflow.map(&:instance_count).should == ["4", "2"]
    end
  end

end
require 'spec_helper'

# Specs for Elasticity::JobFlowStep — parses the Steps/member portion of an
# EMR DescribeJobFlows response into JobFlowStep value objects.
describe Elasticity::JobFlowStep do

  before do
    # Canned response with one job flow containing two steps; the second step
    # is still PENDING and has an empty EndDateTime.
    describe_jobflows_xml = <<-JOBFLOWS
      <DescribeJobFlowsResponse xmlns="http://elasticmapreduce.amazonaws.com/doc/2009-03-31">
        <DescribeJobFlowsResult>
          <JobFlows>
            <member>
              <JobFlowId>j-p</JobFlowId>
              <Name>Pig Job</Name>
              <ExecutionStatusDetail>
                <State>TERMINATED</State>
              </ExecutionStatusDetail>
              <Steps>
                <member>
                  <StepConfig>
                    <Name>Setup Hive</Name>
                  </StepConfig>
                  <ExecutionStatusDetail>
                    <State>FAILED</State>
                    <StartDateTime>
                      2011-10-04T21:49:16Z
                    </StartDateTime>
                    <EndDateTime>
                      2011-10-04T21:51:16Z
                    </EndDateTime>
                  </ExecutionStatusDetail>
                </member>
                <member>
                  <StepConfig>
                    <Name>Run Hive Script</Name>
                  </StepConfig>
                  <ExecutionStatusDetail>
                    <State>PENDING</State>
                    <StartDateTime>
                      2011-10-04T21:51:18Z
                    </StartDateTime>
                    <EndDateTime>
                    </EndDateTime>
                  </ExecutionStatusDetail>
                </member>
              </Steps>
            </member>
          </JobFlows>
        </DescribeJobFlowsResult>
      </DescribeJobFlowsResponse>
    JOBFLOWS
    describe_jobflows_document = Nokogiri::XML(describe_jobflows_xml)
    # Namespaces are stripped so the xpath below can address elements directly.
    describe_jobflows_document.remove_namespaces!
    @members_nodeset = describe_jobflows_document.xpath('/DescribeJobFlowsResponse/DescribeJobFlowsResult/JobFlows/member/Steps/member')
  end

  describe ".from_member_element" do
    it "should return a JobFlowStep with the appropriate fields initialized" do
      jobflow_step = Elasticity::JobFlowStep.from_member_element(@members_nodeset[0])
      jobflow_step.name.should == "Setup Hive"
      jobflow_step.state.should == "FAILED"
      jobflow_step.started_at.should == "2011-10-04T21:49:16Z"
      jobflow_step.ended_at.should == "2011-10-04T21:51:16Z"
    end
  end

  describe ".from_members_nodeset" do
    it "should return JobFlowSteps with the appropriate fields initialized" do
      jobflow_steps = Elasticity::JobFlowStep.from_members_nodeset(@members_nodeset)
      jobflow_steps.map(&:name).should == ["Setup Hive", "Run Hive Script"]
      jobflow_steps.map(&:state).should == ["FAILED", "PENDING"]
      jobflow_steps.map(&:started_at).should == ["2011-10-04T21:49:16Z", "2011-10-04T21:51:18Z"]
      # A step that has not finished yet yields an empty ended_at string.
      jobflow_steps.map(&:ended_at).should == ["2011-10-04T21:51:16Z", ""]
    end
  end

end
require 'spec_helper'

# Specs for Elasticity::PigJob — a convenience wrapper that launches an EMR
# job flow with a Pig-install step followed by a run-pig-script step, and
# exposes a calculated E_PARALLELS value based on the slave instance type.
describe Elasticity::PigJob do

  describe ".new" do
    it "should have good defaults" do
      pig = Elasticity::PigJob.new("access", "secret")
      pig.aws_access_key_id.should == "access"
      pig.aws_secret_access_key.should == "secret"
      pig.ec2_key_name.should == "default"
      pig.hadoop_version.should == "0.20"
      pig.instance_count.should == 2
      pig.master_instance_type.should == "m1.small"
      pig.name.should == "Elasticity Pig Job"
      pig.slave_instance_type.should == "m1.small"
      pig.action_on_failure.should == "TERMINATE_JOB_FLOW"
      pig.log_uri.should == nil
      pig.parallels.should == 1
    end
  end

  describe "#instance_count=" do
    it "should not allow instances to be set less than 2" do
      pig = Elasticity::PigJob.new("access", "secret")
      lambda {
        pig.instance_count = 1
      }.should raise_error(ArgumentError, "Instance count cannot be set to less than 2 (requested 1)")
    end
  end

  # parallels is derived from instance_count and the slave instance type
  # (8 instances => 7 slaves, scaled by a per-type factor).
  describe "calculated value of parallels" do

    before do
      @pig = Elasticity::PigJob.new("access", "secret")
      @pig.instance_count = 8
    end

    context "when slave is m1.small" do
      it "should be 7" do
        @pig.slave_instance_type = "m1.small"
        @pig.parallels.should == 7
      end
    end

    context "when slave is m1.large" do
      it "should be 13" do
        @pig.slave_instance_type = "m1.large"
        @pig.parallels.should == 13
      end
    end

    context "when slave is c1.medium" do
      it "should be 13" do
        @pig.slave_instance_type = "c1.medium"
        @pig.parallels.should == 13
      end
    end

    context "when slave is m1.xlarge" do
      it "should be 26" do
        @pig.slave_instance_type = "m1.xlarge"
        @pig.parallels.should == 26
      end
    end

    context "when slave is c1.xlarge" do
      it "should be 26" do
        @pig.slave_instance_type = "c1.xlarge"
        @pig.parallels.should == 26
      end
    end

    # Unknown instance types fall back to a per-slave factor of 1, so with
    # 7 slaves the calculated value is 7 (description previously said "1",
    # contradicting the assertion below).
    context "when slave is any other type" do
      it "should be 7" do
        @pig.slave_instance_type = "foo"
        @pig.parallels.should == 7
      end
    end

  end

  describe "#run" do

    context "when no bootstrap actions are specified" do

      it "should run the script with the specified variables and return the jobflow_id" do
        # Stub out EMR so we can assert on the exact run_job_flow payload
        # without touching AWS.
        aws = Elasticity::EMR.new("", "")
        aws.should_receive(:run_job_flow).with({
          :name => "Elasticity Pig Job",
          :log_uri => "s3n://slif-test/output/logs",
          :instances => {
            :ec2_key_name => "default",
            :hadoop_version => "0.20",
            :instance_count => 8,
            :master_instance_type => "m1.small",
            :slave_instance_type => "m1.xlarge",
          },
          :steps => [
            # Step 1: install Pig onto the cluster via Amazon's script-runner.
            {
              :action_on_failure => "TERMINATE_JOB_FLOW",
              :hadoop_jar_step => {
                :jar => "s3://elasticmapreduce/libs/script-runner/script-runner.jar",
                :args => [
                  "s3://elasticmapreduce/libs/pig/pig-script",
                  "--base-path",
                  "s3://elasticmapreduce/libs/pig/",
                  "--install-pig"
                ],
              },
              :name => "Setup Pig"
            },
            # Step 2: run the user's script; -p defines the Pig parameters,
            # including the calculated E_PARALLELS (m1.xlarge, 7 slaves => 26).
            {
              :action_on_failure => "CONTINUE",
              :hadoop_jar_step => {
                :jar => "s3://elasticmapreduce/libs/script-runner/script-runner.jar",
                :args => [
                  "s3://elasticmapreduce/libs/pig/pig-script",
                  "--run-pig-script",
                  "--args",
                  "-p", "OUTPUT=s3n://slif-pig-test/output",
                  "-p", "XREFS=s3n://slif-pig-test/xrefs",
                  "-p", "E_PARALLELS=26",
                  "s3n://slif-pig-test/test.pig"
                ],
              },
              :name => "Run Pig Script"
            }
          ]
        }).and_return("new_jobflow_id")

        Elasticity::EMR.should_receive(:new).with("access", "secret").and_return(aws)
        pig = Elasticity::PigJob.new("access", "secret")

        pig.log_uri = "s3n://slif-test/output/logs"
        pig.action_on_failure = "CONTINUE"
        pig.instance_count = 8
        pig.slave_instance_type = "m1.xlarge"

        jobflow_id = pig.run('s3n://slif-pig-test/test.pig', {
          'OUTPUT' => 's3n://slif-pig-test/output',
          'XREFS' => 's3n://slif-pig-test/xrefs'
        })
        jobflow_id.should == "new_jobflow_id"
      end
    end

    context "when bootstrap actions are specified" do
      it "should run the script with the proper job configuration" do
        aws = Elasticity::EMR.new("", "")
        # Only the bootstrap_actions portion of the payload is asserted here.
        aws.should_receive(:run_job_flow).with(hash_including({
          :bootstrap_actions => [
            {
              :name => "Elasticity Bootstrap Action (Configure Hadoop)",
              :script_bootstrap_action => {
                :path => "s3n://elasticmapreduce/bootstrap-actions/configure-hadoop",
                :args => ["-m", "foo=111"]
              }
            },
            {
              :name => "Elasticity Bootstrap Action (Configure Hadoop)",
              :script_bootstrap_action => {
                :path => "s3n://elasticmapreduce/bootstrap-actions/configure-hadoop",
                :args => ["-m", "bar=222"]
              }
            }
          ],
        }))

        Elasticity::EMR.should_receive(:new).with("access", "secret").and_return(aws)
        pig = Elasticity::PigJob.new("access", "secret")
        pig.add_hadoop_bootstrap_action("-m", "foo=111")
        pig.add_hadoop_bootstrap_action("-m", "bar=222")
        pig.run('s3n://slif-pig-test/test.pig')
      end
    end

  end

  # Replays recorded AWS conversations (VCR cassettes) — no live calls are made.
  describe "integration happy path" do

    context "with bootstrap actions" do
      use_vcr_cassette "pig_job/apache_log_reports_with_bootstrap", :record => :none
      it "should kick off the sample Amazon EMR Pig application" do
        pig = Elasticity::PigJob.new(AWS_ACCESS_KEY_ID, AWS_SECRET_KEY)
        pig.ec2_key_name = "sharethrough_dev"
        pig.add_hadoop_bootstrap_action("-m", "mapred.job.reuse.jvm.num.tasks=120")
        jobflow_id = pig.run("s3n://elasticmapreduce/samples/pig-apache/do-reports.pig", {
          "INPUT" => "s3n://elasticmapreduce/samples/pig-apache/input",
          "OUTPUT" => "s3n://slif-elasticity/pig-apache/output/2011-05-10"
        })
        jobflow_id.should == "j-1UK43AWRT3QHD"
      end
    end

    context "without bootstrap actions" do
      use_vcr_cassette "pig_job/apache_log_reports", :record => :none
      it "should kick off the sample Amazon EMR Pig application" do
        pig = Elasticity::PigJob.new(AWS_ACCESS_KEY_ID, AWS_SECRET_KEY)
        pig.log_uri = "s3n://slif-elasticity/pig-apache/logs"
        pig.ec2_key_name = "sharethrough_dev"
        jobflow_id = pig.run("s3n://elasticmapreduce/samples/pig-apache/do-reports.pig", {
          "INPUT" => "s3n://elasticmapreduce/samples/pig-apache/input",
          "OUTPUT" => "s3n://slif-elasticity/pig-apache/output/2011-05-04"
        })
        jobflow_id.should == "j-1HB7A3TBRT3VS"
      end
    end
  end

end