elasticity 1.2 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/HISTORY.mediawiki CHANGED
@@ -1,3 +1,7 @@
1
+ === 1.2.1 ===
2
+
3
+ * Now passing an E_PARALLELS Pig variable with each invocation; its value is a reasonable default for PARALLEL based on the number and type of instances configured.
4
+
1
5
  === 1.2 ===
2
6
 
3
7
  * Added PigJob!
data/README.mediawiki CHANGED
@@ -58,6 +58,30 @@ Like HiveJob, PigJob allows you to quickly launch Pig jobs :)
58
58
  > "j-16PZ24OED71C6"
59
59
  </pre>
60
60
 
61
+ === PARALLEL ===
62
+
63
+ Given the importance of specifying a reasonable value for [http://pig.apache.org/docs/r0.8.1/cookbook.html#Use+the+Parallel+Features PARALLEL] (the number of parallel reducers), Elasticity calculates a reasonable default and passes it along with every invocation in the form of a script variable called E_PARALLELS. This default value is based on the formula in the Pig Cookbook and the number of reducers AWS configures per instance.
64
+
65
+ For example, if you had 8 instances in total (one master plus 7 slaves) and your slaves were m1.xlarge (4 reducers each), the value is 7 * 4 * 0.9 = 25.2, rounded up to 26 (as shown below).
66
+
67
+ <pre>
68
+ s3://elasticmapreduce/libs/pig/pig-script
69
+ --run-pig-script
70
+ --args
71
+ -p INPUT=s3n://elasticmapreduce/samples/pig-apache/input
72
+ -p OUTPUT=s3n://slif-elasticity/pig-apache/output/2011-05-04
73
+ -p E_PARALLELS=26
74
+ s3n://elasticmapreduce/samples/pig-apache/do-reports.pig
75
+ </pre>
76
+
77
+ Use this as you would any other Pig variable.
78
+
79
+ <pre>
80
+ A = LOAD 'myfile' AS (t, u, v);
81
+ B = GROUP A BY t PARALLEL $E_PARALLELS;
82
+ ...
83
+ </pre>
84
+
61
85
  = Amazon API Reference =
62
86
 
63
87
  Elasticity wraps all of the EMR API calls. Please see the Amazon guide for details on these operations because the default values aren't obvious (e.g. the meaning of <code>DescribeJobFlows</code> without parameters).
@@ -2,9 +2,26 @@ module Elasticity
2
2
 
3
3
  class PigJob < Elasticity::SimpleJob
4
4
 
5
+ # Automatically passed as Pig argument E_PARALLELS
6
+ attr_reader :parallels
7
+
5
8
  def initialize(aws_access_key_id, aws_secret_access_key)
6
9
  super
7
10
  @name = "Elasticity Pig Job"
11
+ @parallels = calculate_parallels
12
+ end
13
+
14
+ def instance_count=(num_instances)
15
+ if num_instances < 2
16
+ raise ArgumentError, "Instance count cannot be set to less than 2 (requested #{num_instances})"
17
+ end
18
+ @instance_count = num_instances
19
+ @parallels = calculate_parallels
20
+ end
21
+
22
+ def slave_instance_type=(instance_type)
23
+ @slave_instance_type = instance_type
24
+ @parallels = calculate_parallels
8
25
  end
9
26
 
10
27
  # Run the specified Pig script with the specified variables.
@@ -23,6 +40,7 @@ module Elasticity
23
40
  pig_variables.keys.sort.each do |variable_name|
24
41
  script_arguments.concat(["-p", "#{variable_name}=#{pig_variables[variable_name]}"])
25
42
  end
43
+ script_arguments.concat(["-p", "E_PARALLELS=#{@parallels}"])
26
44
  script_arguments << pig_script
27
45
  jobflow_config = {
28
46
  :name => @name,
@@ -61,6 +79,32 @@ module Elasticity
61
79
 
62
80
  @emr.run_job_flow(jobflow_config)
63
81
  end
82
+
83
+ private
84
+
85
+ # Calculate a common-sense default value of PARALLELS using the following
86
+ # formula from the Pig Cookbook:
87
+ #
88
+ # <num machines> * <num reduce slots per machine> * 0.9
89
+ #
90
+ # With the following reducer configuration (from an AWS forum post):
91
+ #
92
+ # m1.small 1
93
+ # m1.large 2
94
+ # m1.xlarge 4
95
+ # c1.medium 2
96
+ # c1.xlarge 4
97
+ def calculate_parallels
98
+ reduce_slots = case @slave_instance_type
99
+ when "m1.small" then 1
100
+ when "m1.large" then 2
101
+ when "m1.xlarge" then 4
102
+ when "c1.medium" then 2
103
+ when "c1.xlarge" then 4
104
+ else 1
105
+ end
106
+ ((@instance_count - 1).to_f * reduce_slots.to_f * 0.9).ceil
107
+ end
64
108
 
65
109
  end
66
110
 
@@ -1,3 +1,3 @@
1
1
  module Elasticity
2
- VERSION = "1.2"
2
+ VERSION = "1.2.1"
3
3
  end
@@ -2,8 +2,8 @@
2
2
  - !ruby/struct:VCR::HTTPInteraction
3
3
  request: !ruby/struct:VCR::Request
4
4
  method: :get
5
- uri: !ruby/regexp /^http:\/\/elasticmapreduce\.amazonaws\.com:80\/\?AWSAccessKeyId=AKIAI7HEMMNKGT6VFFSA&Instances\.Ec2KeyName=sharethrough_dev&Instances\.HadoopVersion=0\.20&Instances\.InstanceCount=2&Instances\.MasterInstanceType=m1\.small&Instances\.SlaveInstanceType=m1\.small&LogUri=s3n:\/\/slif-elasticity\/pig-apache\/logs&Name=Elasticity%20Pig%20Job&Operation=RunJobFlow&.*&Steps\.member\.1\.ActionOnFailure=TERMINATE_JOB_FLOW&Steps\.member\.1\.HadoopJarStep\.Args\.member\.1=s3:\/\/elasticmapreduce\/libs\/pig\/pig-script&Steps\.member\.1\.HadoopJarStep\.Args\.member\.2=--base-path&Steps\.member\.1\.HadoopJarStep\.Args\.member\.3=s3:\/\/elasticmapreduce\/libs\/pig\/&Steps\.member\.1\.HadoopJarStep\.Args\.member\.4=--install-pig&Steps\.member\.1\.HadoopJarStep\.Jar=s3:\/\/elasticmapreduce\/libs\/script-runner\/script-runner\.jar&Steps\.member\.1\.Name=Setup%20Pig&Steps\.member\.2\.ActionOnFailure=TERMINATE_JOB_FLOW&Steps\.member\.2\.HadoopJarStep\.Args\.member\.1=s3:\/\/elasticmapreduce\/libs\/pig\/pig-script&Steps\.member\.2\.HadoopJarStep\.Args\.member\.2=--run-pig-script&Steps\.member\.2\.HadoopJarStep\.Args\.member\.3=--args&Steps\.member\.2\.HadoopJarStep\.Args\.member\.4=-p&Steps\.member\.2\.HadoopJarStep\.Args\.member\.5=INPUT=s3n:\/\/elasticmapreduce\/samples\/pig-apache\/input&Steps\.member\.2\.HadoopJarStep\.Args\.member\.6=-p&Steps\.member\.2\.HadoopJarStep\.Args\.member\.7=OUTPUT=s3n:\/\/slif-elasticity\/pig-apache\/output\/2011-05-04&Steps\.member\.2\.HadoopJarStep\.Args\.member\.8=s3n:\/\/elasticmapreduce\/samples\/pig-apache\/do-reports\.pig&Steps\.member\.2\.HadoopJarStep\.Jar=s3:\/\/elasticmapreduce\/libs\/script-runner\/script-runner\.jar&Steps\.member\.2\.Name=Run%20Pig%20Script/
6
- body:
5
+ uri: !ruby/regexp /^http:\/\/elasticmapreduce.amazonaws.com:80\/\?AWSAccessKeyId=AKIAI7HEMMNKGT6VFFSA&Instances.Ec2KeyName=sharethrough_dev&Instances.HadoopVersion=0.20&Instances.InstanceCount=2&Instances.MasterInstanceType=m1.small&Instances.SlaveInstanceType=m1.small&LogUri=s3n:\/\/slif-elasticity\/pig-apache\/logs&Name=Elasticity%20Pig%20Job&Operation=RunJobFlow&.*&Steps.member.1.ActionOnFailure=TERMINATE_JOB_FLOW&Steps.member.1.HadoopJarStep.Args.member.1=s3:\/\/elasticmapreduce\/libs\/pig\/pig-script&Steps.member.1.HadoopJarStep.Args.member.2=--base-path&Steps.member.1.HadoopJarStep.Args.member.3=s3:\/\/elasticmapreduce\/libs\/pig\/&Steps.member.1.HadoopJarStep.Args.member.4=--install-pig&Steps.member.1.HadoopJarStep.Jar=s3:\/\/elasticmapreduce\/libs\/script-runner\/script-runner.jar&Steps.member.1.Name=Setup%20Pig&Steps.member.2.ActionOnFailure=TERMINATE_JOB_FLOW&Steps.member.2.HadoopJarStep.Args.member.1=s3:\/\/elasticmapreduce\/libs\/pig\/pig-script&Steps.member.2.HadoopJarStep.Args.member.10=s3n:\/\/elasticmapreduce\/samples\/pig-apache\/do-reports.pig&Steps.member.2.HadoopJarStep.Args.member.2=--run-pig-script&Steps.member.2.HadoopJarStep.Args.member.3=--args&Steps.member.2.HadoopJarStep.Args.member.4=-p&Steps.member.2.HadoopJarStep.Args.member.5=INPUT=s3n:\/\/elasticmapreduce\/samples\/pig-apache\/input&Steps.member.2.HadoopJarStep.Args.member.6=-p&Steps.member.2.HadoopJarStep.Args.member.7=OUTPUT=s3n:\/\/slif-elasticity\/pig-apache\/output\/2011-05-04&Steps.member.2.HadoopJarStep.Args.member.8=-p&Steps.member.2.HadoopJarStep.Args.member.9=E_PARALLELS=1&Steps.member.2.HadoopJarStep.Jar=s3:\/\/elasticmapreduce\/libs\/script-runner\/script-runner.jar&Steps.member.2.Name=Run%20Pig%20Script/
6
+ body:
7
7
  headers:
8
8
  accept:
9
9
  - "*/*; q=0.5, application/xml"
@@ -15,20 +15,20 @@
15
15
  message: OK
16
16
  headers:
17
17
  x-amzn-requestid:
18
- - b237df84-761d-11e0-b625-05a26eeda1d8
18
+ - c00d285d-7935-11e0-a51d-7bf947dae271
19
19
  content-type:
20
20
  - text/xml
21
21
  date:
22
- - Wed, 04 May 2011 07:11:13 GMT
22
+ - Sun, 08 May 2011 05:40:58 GMT
23
23
  content-length:
24
24
  - "297"
25
25
  body: |
26
26
  <RunJobFlowResponse xmlns="http://elasticmapreduce.amazonaws.com/doc/2009-03-31">
27
27
  <RunJobFlowResult>
28
- <JobFlowId>j-16PZ24OED71C6</JobFlowId>
28
+ <JobFlowId>j-1HB7A3TBRT3VS</JobFlowId>
29
29
  </RunJobFlowResult>
30
30
  <ResponseMetadata>
31
- <RequestId>b237df84-761d-11e0-b625-05a26eeda1d8</RequestId>
31
+ <RequestId>c00d285d-7935-11e0-a51d-7bf947dae271</RequestId>
32
32
  </ResponseMetadata>
33
33
  </RunJobFlowResponse>
34
34
 
@@ -3,19 +3,78 @@ require 'spec_helper'
3
3
  describe Elasticity::PigJob do
4
4
 
5
5
  describe ".new" do
6
-
7
6
  it "should have good defaults" do
8
- hive = Elasticity::PigJob.new("access", "secret")
9
- hive.aws_access_key_id.should == "access"
10
- hive.aws_secret_access_key.should == "secret"
11
- hive.ec2_key_name.should == "default"
12
- hive.hadoop_version.should == "0.20"
13
- hive.instance_count.should == 2
14
- hive.master_instance_type.should == "m1.small"
15
- hive.name.should == "Elasticity Pig Job"
16
- hive.slave_instance_type.should == "m1.small"
17
- hive.action_on_failure.should == "TERMINATE_JOB_FLOW"
18
- hive.log_uri.should == nil
7
+ pig = Elasticity::PigJob.new("access", "secret")
8
+ pig.aws_access_key_id.should == "access"
9
+ pig.aws_secret_access_key.should == "secret"
10
+ pig.ec2_key_name.should == "default"
11
+ pig.hadoop_version.should == "0.20"
12
+ pig.instance_count.should == 2
13
+ pig.master_instance_type.should == "m1.small"
14
+ pig.name.should == "Elasticity Pig Job"
15
+ pig.slave_instance_type.should == "m1.small"
16
+ pig.action_on_failure.should == "TERMINATE_JOB_FLOW"
17
+ pig.log_uri.should == nil
18
+ pig.parallels.should == 1
19
+ end
20
+ end
21
+
22
+ describe "#instance_count=" do
23
+ it "should not allow instances to be set less than 2" do
24
+ pig = Elasticity::PigJob.new("access", "secret")
25
+ lambda {
26
+ pig.instance_count = 1
27
+ }.should raise_error(ArgumentError, "Instance count cannot be set to less than 2 (requested 1)")
28
+ end
29
+ end
30
+
31
+ describe "calculated value of parallels" do
32
+
33
+ before do
34
+ @pig = Elasticity::PigJob.new("access", "secret")
35
+ @pig.instance_count = 8
36
+ end
37
+
38
+ context "when slave is m1.small" do
39
+ it "should be 7" do
40
+ @pig.slave_instance_type = "m1.small"
41
+ @pig.parallels.should == 7
42
+ end
43
+ end
44
+
45
+ context "when slave is m1.large" do
46
+ it "should be 13" do
47
+ @pig.slave_instance_type = "m1.large"
48
+ @pig.parallels.should == 13
49
+ end
50
+ end
51
+
52
+ context "when slave is c1.medium" do
53
+ it "should be 13" do
54
+ @pig.slave_instance_type = "c1.medium"
55
+ @pig.parallels.should == 13
56
+ end
57
+ end
58
+
59
+ context "when slave is m1.xlarge" do
60
+ it "should be 26" do
61
+ @pig.slave_instance_type = "m1.xlarge"
62
+ @pig.parallels.should == 26
63
+ end
64
+ end
65
+
66
+ context "when slave is c1.xlarge" do
67
+ it "should be 26" do
68
+ @pig.slave_instance_type = "c1.xlarge"
69
+ @pig.parallels.should == 26
70
+ end
71
+ end
72
+
73
+ context "when slave is any other type" do
74
+ it "should be 7" do
75
+ @pig.slave_instance_type = "foo"
76
+ @pig.parallels.should == 7
77
+ end
19
78
  end
20
79
 
21
80
  end
@@ -30,9 +89,9 @@ describe Elasticity::PigJob do
30
89
  :instances => {
31
90
  :ec2_key_name => "default",
32
91
  :hadoop_version => "0.20",
33
- :instance_count => 2,
92
+ :instance_count => 8,
34
93
  :master_instance_type => "m1.small",
35
- :slave_instance_type => "m1.small",
94
+ :slave_instance_type => "m1.xlarge",
36
95
  },
37
96
  :steps => [
38
97
  {
@@ -58,6 +117,7 @@ describe Elasticity::PigJob do
58
117
  "--args",
59
118
  "-p", "OUTPUT=s3n://slif-pig-test/output",
60
119
  "-p", "XREFS=s3n://slif-pig-test/xrefs",
120
+ "-p", "E_PARALLELS=26",
61
121
  "s3n://slif-pig-test/test.pig"
62
122
  ],
63
123
  },
@@ -65,11 +125,15 @@ describe Elasticity::PigJob do
65
125
  }
66
126
  ]
67
127
  }).and_return("new_jobflow_id")
68
- Elasticity::EMR.should_receive(:new).with("access", "secret").and_return(aws)
69
128
 
129
+ Elasticity::EMR.should_receive(:new).with("access", "secret").and_return(aws)
70
130
  pig = Elasticity::PigJob.new("access", "secret")
131
+
71
132
  pig.log_uri = "s3n://slif-test/output/logs"
72
133
  pig.action_on_failure = "CONTINUE"
134
+ pig.instance_count = 8
135
+ pig.slave_instance_type = "m1.xlarge"
136
+
73
137
  jobflow_id = pig.run('s3n://slif-pig-test/test.pig', {
74
138
  'OUTPUT' => 's3n://slif-pig-test/output',
75
139
  'XREFS' => 's3n://slif-pig-test/xrefs'
@@ -89,7 +153,7 @@ describe Elasticity::PigJob do
89
153
  "INPUT" => "s3n://elasticmapreduce/samples/pig-apache/input",
90
154
  "OUTPUT" => "s3n://slif-elasticity/pig-apache/output/2011-05-04"
91
155
  })
92
- jobflow_id.should == "j-16PZ24OED71C6"
156
+ jobflow_id.should == "j-1HB7A3TBRT3VS"
93
157
  end
94
158
  end
95
159
 
metadata CHANGED
@@ -1,12 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: elasticity
3
3
  version: !ruby/object:Gem::Version
4
- hash: 11
4
+ hash: 29
5
5
  prerelease:
6
6
  segments:
7
7
  - 1
8
8
  - 2
9
- version: "1.2"
9
+ - 1
10
+ version: 1.2.1
10
11
  platform: ruby
11
12
  authors:
12
13
  - Robert Slifka
@@ -14,7 +15,7 @@ autorequire:
14
15
  bindir: bin
15
16
  cert_chain: []
16
17
 
17
- date: 2011-05-04 00:00:00 -07:00
18
+ date: 2011-05-07 00:00:00 -07:00
18
19
  default_executable:
19
20
  dependencies:
20
21
  - !ruby/object:Gem::Dependency