elasticity 1.2 → 1.2.1
Sign up to get free protection for your applications and to get access to all the features.
data/HISTORY.mediawiki
CHANGED
data/README.mediawiki
CHANGED
@@ -58,6 +58,30 @@ Like HiveJob, PigJob allows you to quickly launch Pig jobs :)
|
|
58
58
|
> "j-16PZ24OED71C6"
|
59
59
|
</pre>
|
60
60
|
|
61
|
+
=== PARALLEL ===
|
62
|
+
|
63
|
+
Given the importance of specifying a reasonable value for [http://pig.apache.org/docs/r0.8.1/cookbook.html#Use+the+Parallel+Features PARALLEL] (the number of parallel reducers), Elasticity calculates a reasonable default and passes it through with every invocation as a script variable called E_PARALLELS. This default value is based on the formula in the Pig Cookbook and the number of reducers AWS configures per instance.
|
64
|
+
|
65
|
+
For example, if you have 8 instances in total (1 master and 7 slaves) and your slaves are m1.xlarge (4 reducers each), the value is 26 (7 * 4 * 0.9 = 25.2, rounded up), as shown below.
|
66
|
+
|
67
|
+
<pre>
|
68
|
+
s3://elasticmapreduce/libs/pig/pig-script
|
69
|
+
--run-pig-script
|
70
|
+
--args
|
71
|
+
-p INPUT=s3n://elasticmapreduce/samples/pig-apache/input
|
72
|
+
-p OUTPUT=s3n://slif-elasticity/pig-apache/output/2011-05-04
|
73
|
+
-p E_PARALLELS=26
|
74
|
+
s3n://elasticmapreduce/samples/pig-apache/do-reports.pig
|
75
|
+
</pre>
|
76
|
+
|
77
|
+
Use this as you would any other Pig variable.
|
78
|
+
|
79
|
+
<pre>
|
80
|
+
A = LOAD 'myfile' AS (t, u, v);
|
81
|
+
B = GROUP A BY t PARALLEL $E_PARALLELS;
|
82
|
+
...
|
83
|
+
</pre>
|
84
|
+
|
61
85
|
= Amazon API Reference =
|
62
86
|
|
63
87
|
Elasticity wraps all of the EMR API calls. Please see the Amazon guide for details on these operations because the default values aren't obvious (e.g. the meaning of <code>DescribeJobFlows</code> without parameters).
|
data/lib/elasticity/pig_job.rb
CHANGED
@@ -2,9 +2,26 @@ module Elasticity
|
|
2
2
|
|
3
3
|
class PigJob < Elasticity::SimpleJob
|
4
4
|
|
5
|
+
# Automatically passed as Pig argument E_PARALLELS
|
6
|
+
attr_reader :parallels
|
7
|
+
|
5
8
|
def initialize(aws_access_key_id, aws_secret_access_key)
|
6
9
|
super
|
7
10
|
@name = "Elasticity Pig Job"
|
11
|
+
@parallels = calculate_parallels
|
12
|
+
end
|
13
|
+
|
14
|
+
def instance_count=(num_instances)
|
15
|
+
if num_instances < 2
|
16
|
+
raise ArgumentError, "Instance count cannot be set to less than 2 (requested #{num_instances})"
|
17
|
+
end
|
18
|
+
@instance_count = num_instances
|
19
|
+
@parallels = calculate_parallels
|
20
|
+
end
|
21
|
+
|
22
|
+
def slave_instance_type=(instance_type)
|
23
|
+
@slave_instance_type = instance_type
|
24
|
+
@parallels = calculate_parallels
|
8
25
|
end
|
9
26
|
|
10
27
|
# Run the specified Pig script with the specified variables.
|
@@ -23,6 +40,7 @@ module Elasticity
|
|
23
40
|
pig_variables.keys.sort.each do |variable_name|
|
24
41
|
script_arguments.concat(["-p", "#{variable_name}=#{pig_variables[variable_name]}"])
|
25
42
|
end
|
43
|
+
script_arguments.concat(["-p", "E_PARALLELS=#{@parallels}"])
|
26
44
|
script_arguments << pig_script
|
27
45
|
jobflow_config = {
|
28
46
|
:name => @name,
|
@@ -61,6 +79,32 @@ module Elasticity
|
|
61
79
|
|
62
80
|
@emr.run_job_flow(jobflow_config)
|
63
81
|
end
|
82
|
+
|
83
|
+
private
|
84
|
+
|
85
|
+
# Calculate a common-sense default value of PARALLELS using the following
|
86
|
+
# formula from the Pig Cookbook:
|
87
|
+
#
|
88
|
+
# <num slave machines> * <num reduce slots per machine> * 0.9 (rounded up)
|
89
|
+
#
|
90
|
+
# With the following reducer configuration (from an AWS forum post):
|
91
|
+
#
|
92
|
+
# m1.small 1
|
93
|
+
# m1.large 2
|
94
|
+
# m1.xlarge 4
|
95
|
+
# c1.medium 2
|
96
|
+
# c1.xlarge 4
|
97
|
+
def calculate_parallels
|
98
|
+
reduce_slots = case @slave_instance_type
|
99
|
+
when "m1.small" then 1
|
100
|
+
when "m1.large" then 2
|
101
|
+
when "m1.xlarge" then 4
|
102
|
+
when "c1.medium" then 2
|
103
|
+
when "c1.xlarge" then 4
|
104
|
+
else 1
|
105
|
+
end
|
106
|
+
((@instance_count - 1).to_f * reduce_slots.to_f * 0.9).ceil
|
107
|
+
end
|
64
108
|
|
65
109
|
end
|
66
110
|
|
data/lib/elasticity/version.rb
CHANGED
@@ -2,8 +2,8 @@
|
|
2
2
|
- !ruby/struct:VCR::HTTPInteraction
|
3
3
|
request: !ruby/struct:VCR::Request
|
4
4
|
method: :get
|
5
|
-
uri: !ruby/regexp /^http:\/\/elasticmapreduce
|
6
|
-
body:
|
5
|
+
uri: !ruby/regexp /^http:\/\/elasticmapreduce.amazonaws.com:80\/\?AWSAccessKeyId=AKIAI7HEMMNKGT6VFFSA&Instances.Ec2KeyName=sharethrough_dev&Instances.HadoopVersion=0.20&Instances.InstanceCount=2&Instances.MasterInstanceType=m1.small&Instances.SlaveInstanceType=m1.small&LogUri=s3n:\/\/slif-elasticity\/pig-apache\/logs&Name=Elasticity%20Pig%20Job&Operation=RunJobFlow&.*&Steps.member.1.ActionOnFailure=TERMINATE_JOB_FLOW&Steps.member.1.HadoopJarStep.Args.member.1=s3:\/\/elasticmapreduce\/libs\/pig\/pig-script&Steps.member.1.HadoopJarStep.Args.member.2=--base-path&Steps.member.1.HadoopJarStep.Args.member.3=s3:\/\/elasticmapreduce\/libs\/pig\/&Steps.member.1.HadoopJarStep.Args.member.4=--install-pig&Steps.member.1.HadoopJarStep.Jar=s3:\/\/elasticmapreduce\/libs\/script-runner\/script-runner.jar&Steps.member.1.Name=Setup%20Pig&Steps.member.2.ActionOnFailure=TERMINATE_JOB_FLOW&Steps.member.2.HadoopJarStep.Args.member.1=s3:\/\/elasticmapreduce\/libs\/pig\/pig-script&Steps.member.2.HadoopJarStep.Args.member.10=s3n:\/\/elasticmapreduce\/samples\/pig-apache\/do-reports.pig&Steps.member.2.HadoopJarStep.Args.member.2=--run-pig-script&Steps.member.2.HadoopJarStep.Args.member.3=--args&Steps.member.2.HadoopJarStep.Args.member.4=-p&Steps.member.2.HadoopJarStep.Args.member.5=INPUT=s3n:\/\/elasticmapreduce\/samples\/pig-apache\/input&Steps.member.2.HadoopJarStep.Args.member.6=-p&Steps.member.2.HadoopJarStep.Args.member.7=OUTPUT=s3n:\/\/slif-elasticity\/pig-apache\/output\/2011-05-04&Steps.member.2.HadoopJarStep.Args.member.8=-p&Steps.member.2.HadoopJarStep.Args.member.9=E_PARALLELS=1&Steps.member.2.HadoopJarStep.Jar=s3:\/\/elasticmapreduce\/libs\/script-runner\/script-runner.jar&Steps.member.2.Name=Run%20Pig%20Script/
|
6
|
+
body:
|
7
7
|
headers:
|
8
8
|
accept:
|
9
9
|
- "*/*; q=0.5, application/xml"
|
@@ -15,20 +15,20 @@
|
|
15
15
|
message: OK
|
16
16
|
headers:
|
17
17
|
x-amzn-requestid:
|
18
|
-
-
|
18
|
+
- c00d285d-7935-11e0-a51d-7bf947dae271
|
19
19
|
content-type:
|
20
20
|
- text/xml
|
21
21
|
date:
|
22
|
-
-
|
22
|
+
- Sun, 08 May 2011 05:40:58 GMT
|
23
23
|
content-length:
|
24
24
|
- "297"
|
25
25
|
body: |
|
26
26
|
<RunJobFlowResponse xmlns="http://elasticmapreduce.amazonaws.com/doc/2009-03-31">
|
27
27
|
<RunJobFlowResult>
|
28
|
-
<JobFlowId>j-
|
28
|
+
<JobFlowId>j-1HB7A3TBRT3VS</JobFlowId>
|
29
29
|
</RunJobFlowResult>
|
30
30
|
<ResponseMetadata>
|
31
|
-
<RequestId>
|
31
|
+
<RequestId>c00d285d-7935-11e0-a51d-7bf947dae271</RequestId>
|
32
32
|
</ResponseMetadata>
|
33
33
|
</RunJobFlowResponse>
|
34
34
|
|
@@ -3,19 +3,78 @@ require 'spec_helper'
|
|
3
3
|
describe Elasticity::PigJob do
|
4
4
|
|
5
5
|
describe ".new" do
|
6
|
-
|
7
6
|
it "should have good defaults" do
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
7
|
+
pig = Elasticity::PigJob.new("access", "secret")
|
8
|
+
pig.aws_access_key_id.should == "access"
|
9
|
+
pig.aws_secret_access_key.should == "secret"
|
10
|
+
pig.ec2_key_name.should == "default"
|
11
|
+
pig.hadoop_version.should == "0.20"
|
12
|
+
pig.instance_count.should == 2
|
13
|
+
pig.master_instance_type.should == "m1.small"
|
14
|
+
pig.name.should == "Elasticity Pig Job"
|
15
|
+
pig.slave_instance_type.should == "m1.small"
|
16
|
+
pig.action_on_failure.should == "TERMINATE_JOB_FLOW"
|
17
|
+
pig.log_uri.should == nil
|
18
|
+
pig.parallels.should == 1
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
describe "#instance_count=" do
|
23
|
+
it "should not allow instances to be set less than 2" do
|
24
|
+
pig = Elasticity::PigJob.new("access", "secret")
|
25
|
+
lambda {
|
26
|
+
pig.instance_count = 1
|
27
|
+
}.should raise_error(ArgumentError, "Instance count cannot be set to less than 2 (requested 1)")
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
describe "calculated value of parallels" do
|
32
|
+
|
33
|
+
before do
|
34
|
+
@pig = Elasticity::PigJob.new("access", "secret")
|
35
|
+
@pig.instance_count = 8
|
36
|
+
end
|
37
|
+
|
38
|
+
context "when slave is m1.small" do
|
39
|
+
it "should be 7" do
|
40
|
+
@pig.slave_instance_type = "m1.small"
|
41
|
+
@pig.parallels.should == 7
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
context "when slave is m1.large" do
|
46
|
+
it "should be 13" do
|
47
|
+
@pig.slave_instance_type = "m1.large"
|
48
|
+
@pig.parallels.should == 13
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
context "when slave is c1.medium" do
|
53
|
+
it "should be 13" do
|
54
|
+
@pig.slave_instance_type = "c1.medium"
|
55
|
+
@pig.parallels.should == 13
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
context "when slave is m1.xlarge" do
|
60
|
+
it "should be 26" do
|
61
|
+
@pig.slave_instance_type = "m1.xlarge"
|
62
|
+
@pig.parallels.should == 26
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
context "when slave is c1.xlarge" do
|
67
|
+
it "should be 26" do
|
68
|
+
@pig.slave_instance_type = "c1.xlarge"
|
69
|
+
@pig.parallels.should == 26
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
context "when slave is any other type" do
|
74
|
+
it "should be 1" do
|
75
|
+
@pig.slave_instance_type = "foo"
|
76
|
+
@pig.parallels.should == 7
|
77
|
+
end
|
19
78
|
end
|
20
79
|
|
21
80
|
end
|
@@ -30,9 +89,9 @@ describe Elasticity::PigJob do
|
|
30
89
|
:instances => {
|
31
90
|
:ec2_key_name => "default",
|
32
91
|
:hadoop_version => "0.20",
|
33
|
-
:instance_count =>
|
92
|
+
:instance_count => 8,
|
34
93
|
:master_instance_type => "m1.small",
|
35
|
-
:slave_instance_type => "m1.
|
94
|
+
:slave_instance_type => "m1.xlarge",
|
36
95
|
},
|
37
96
|
:steps => [
|
38
97
|
{
|
@@ -58,6 +117,7 @@ describe Elasticity::PigJob do
|
|
58
117
|
"--args",
|
59
118
|
"-p", "OUTPUT=s3n://slif-pig-test/output",
|
60
119
|
"-p", "XREFS=s3n://slif-pig-test/xrefs",
|
120
|
+
"-p", "E_PARALLELS=26",
|
61
121
|
"s3n://slif-pig-test/test.pig"
|
62
122
|
],
|
63
123
|
},
|
@@ -65,11 +125,15 @@ describe Elasticity::PigJob do
|
|
65
125
|
}
|
66
126
|
]
|
67
127
|
}).and_return("new_jobflow_id")
|
68
|
-
Elasticity::EMR.should_receive(:new).with("access", "secret").and_return(aws)
|
69
128
|
|
129
|
+
Elasticity::EMR.should_receive(:new).with("access", "secret").and_return(aws)
|
70
130
|
pig = Elasticity::PigJob.new("access", "secret")
|
131
|
+
|
71
132
|
pig.log_uri = "s3n://slif-test/output/logs"
|
72
133
|
pig.action_on_failure = "CONTINUE"
|
134
|
+
pig.instance_count = 8
|
135
|
+
pig.slave_instance_type = "m1.xlarge"
|
136
|
+
|
73
137
|
jobflow_id = pig.run('s3n://slif-pig-test/test.pig', {
|
74
138
|
'OUTPUT' => 's3n://slif-pig-test/output',
|
75
139
|
'XREFS' => 's3n://slif-pig-test/xrefs'
|
@@ -89,7 +153,7 @@ describe Elasticity::PigJob do
|
|
89
153
|
"INPUT" => "s3n://elasticmapreduce/samples/pig-apache/input",
|
90
154
|
"OUTPUT" => "s3n://slif-elasticity/pig-apache/output/2011-05-04"
|
91
155
|
})
|
92
|
-
jobflow_id.should == "j-
|
156
|
+
jobflow_id.should == "j-1HB7A3TBRT3VS"
|
93
157
|
end
|
94
158
|
end
|
95
159
|
|
metadata
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: elasticity
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 29
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
8
|
- 2
|
9
|
-
|
9
|
+
- 1
|
10
|
+
version: 1.2.1
|
10
11
|
platform: ruby
|
11
12
|
authors:
|
12
13
|
- Robert Slifka
|
@@ -14,7 +15,7 @@ autorequire:
|
|
14
15
|
bindir: bin
|
15
16
|
cert_chain: []
|
16
17
|
|
17
|
-
date: 2011-05-
|
18
|
+
date: 2011-05-07 00:00:00 -07:00
|
18
19
|
default_executable:
|
19
20
|
dependencies:
|
20
21
|
- !ruby/object:Gem::Dependency
|