elasticity 1.2 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/HISTORY.mediawiki
CHANGED
data/README.mediawiki
CHANGED
@@ -58,6 +58,30 @@ Like HiveJob, PigJob allows you to quickly launch Pig jobs :)
|
|
58
58
|
> "j-16PZ24OED71C6"
|
59
59
|
</pre>
|
60
60
|
|
61
|
+
=== PARALLEL ===
|
62
|
+
|
63
|
+
Given the importance of specifying a reasonable value for [http://pig.apache.org/docs/r0.8.1/cookbook.html#Use+the+Parallel+Features PARALLEL] (the number of parallel reducers), Elasticity calculates a reasonable default and passes it with every invocation as a script variable called E_PARALLELS. This default value is based on the formula in the Pig Cookbook and the number of reducers AWS configures per instance.
|
64
|
+
|
65
|
+
For example, if you have 8 instances in total and your slaves are m1.xlarge (4 reduce slots each), the value is 26: one instance is excluded from the count, so 7 * 4 * 0.9 = 25.2, rounded up (as shown below).
|
66
|
+
|
67
|
+
<pre>
|
68
|
+
s3://elasticmapreduce/libs/pig/pig-script
|
69
|
+
--run-pig-script
|
70
|
+
--args
|
71
|
+
-p INPUT=s3n://elasticmapreduce/samples/pig-apache/input
|
72
|
+
-p OUTPUT=s3n://slif-elasticity/pig-apache/output/2011-05-04
|
73
|
+
-p E_PARALLELS=26
|
74
|
+
s3n://elasticmapreduce/samples/pig-apache/do-reports.pig
|
75
|
+
</pre>
|
76
|
+
|
77
|
+
Use this as you would any other Pig variable.
|
78
|
+
|
79
|
+
<pre>
|
80
|
+
A = LOAD 'myfile' AS (t, u, v);
|
81
|
+
B = GROUP A BY t PARALLEL $E_PARALLELS;
|
82
|
+
...
|
83
|
+
</pre>
|
84
|
+
|
61
85
|
= Amazon API Reference =
|
62
86
|
|
63
87
|
Elasticity wraps all of the EMR API calls. Please see the Amazon guide for details on these operations because the default values aren't obvious (e.g. the meaning of <code>DescribeJobFlows</code> without parameters).
|
data/lib/elasticity/pig_job.rb
CHANGED
@@ -2,9 +2,26 @@ module Elasticity
|
|
2
2
|
|
3
3
|
class PigJob < Elasticity::SimpleJob
|
4
4
|
|
5
|
+
# Automatically passed as Pig argument E_PARALLELS
|
6
|
+
attr_reader :parallels
|
7
|
+
|
5
8
|
def initialize(aws_access_key_id, aws_secret_access_key)
|
6
9
|
super
|
7
10
|
@name = "Elasticity Pig Job"
|
11
|
+
@parallels = calculate_parallels
|
12
|
+
end
|
13
|
+
|
14
|
+
def instance_count=(num_instances)
|
15
|
+
if num_instances < 2
|
16
|
+
raise ArgumentError, "Instance count cannot be set to less than 2 (requested #{num_instances})"
|
17
|
+
end
|
18
|
+
@instance_count = num_instances
|
19
|
+
@parallels = calculate_parallels
|
20
|
+
end
|
21
|
+
|
22
|
+
def slave_instance_type=(instance_type)
|
23
|
+
@slave_instance_type = instance_type
|
24
|
+
@parallels = calculate_parallels
|
8
25
|
end
|
9
26
|
|
10
27
|
# Run the specified Pig script with the specified variables.
|
@@ -23,6 +40,7 @@ module Elasticity
|
|
23
40
|
pig_variables.keys.sort.each do |variable_name|
|
24
41
|
script_arguments.concat(["-p", "#{variable_name}=#{pig_variables[variable_name]}"])
|
25
42
|
end
|
43
|
+
script_arguments.concat(["-p", "E_PARALLELS=#{@parallels}"])
|
26
44
|
script_arguments << pig_script
|
27
45
|
jobflow_config = {
|
28
46
|
:name => @name,
|
@@ -61,6 +79,32 @@ module Elasticity
|
|
61
79
|
|
62
80
|
@emr.run_job_flow(jobflow_config)
|
63
81
|
end
|
82
|
+
|
83
|
+
private
|
84
|
+
|
85
|
+
# Calculate a common-sense default value of PARALLELS using the following
|
86
|
+
# formula from the Pig Cookbook:
|
87
|
+
#
|
88
|
+
# <num machines> * <num reduce slots per machine> * 0.9
|
89
|
+
#
|
90
|
+
# With the following reducer configuration (from an AWS forum post):
|
91
|
+
#
|
92
|
+
# m1.small 1
|
93
|
+
# m1.large 2
|
94
|
+
# m1.xlarge 4
|
95
|
+
# c1.medium 2
|
96
|
+
# c1.xlarge 4
|
97
|
+
def calculate_parallels
|
98
|
+
reduce_slots = case @slave_instance_type
|
99
|
+
when "m1.small" then 1
|
100
|
+
when "m1.large" then 2
|
101
|
+
when "m1.xlarge" then 4
|
102
|
+
when "c1.medium" then 2
|
103
|
+
when "c1.xlarge" then 4
|
104
|
+
else 1
|
105
|
+
end
|
106
|
+
((@instance_count - 1).to_f * reduce_slots.to_f * 0.9).ceil
|
107
|
+
end
|
64
108
|
|
65
109
|
end
|
66
110
|
|
data/lib/elasticity/version.rb
CHANGED
@@ -2,8 +2,8 @@
|
|
2
2
|
- !ruby/struct:VCR::HTTPInteraction
|
3
3
|
request: !ruby/struct:VCR::Request
|
4
4
|
method: :get
|
5
|
-
uri: !ruby/regexp /^http:\/\/elasticmapreduce
|
6
|
-
body:
|
5
|
+
uri: !ruby/regexp /^http:\/\/elasticmapreduce.amazonaws.com:80\/\?AWSAccessKeyId=AKIAI7HEMMNKGT6VFFSA&Instances.Ec2KeyName=sharethrough_dev&Instances.HadoopVersion=0.20&Instances.InstanceCount=2&Instances.MasterInstanceType=m1.small&Instances.SlaveInstanceType=m1.small&LogUri=s3n:\/\/slif-elasticity\/pig-apache\/logs&Name=Elasticity%20Pig%20Job&Operation=RunJobFlow&.*&Steps.member.1.ActionOnFailure=TERMINATE_JOB_FLOW&Steps.member.1.HadoopJarStep.Args.member.1=s3:\/\/elasticmapreduce\/libs\/pig\/pig-script&Steps.member.1.HadoopJarStep.Args.member.2=--base-path&Steps.member.1.HadoopJarStep.Args.member.3=s3:\/\/elasticmapreduce\/libs\/pig\/&Steps.member.1.HadoopJarStep.Args.member.4=--install-pig&Steps.member.1.HadoopJarStep.Jar=s3:\/\/elasticmapreduce\/libs\/script-runner\/script-runner.jar&Steps.member.1.Name=Setup%20Pig&Steps.member.2.ActionOnFailure=TERMINATE_JOB_FLOW&Steps.member.2.HadoopJarStep.Args.member.1=s3:\/\/elasticmapreduce\/libs\/pig\/pig-script&Steps.member.2.HadoopJarStep.Args.member.10=s3n:\/\/elasticmapreduce\/samples\/pig-apache\/do-reports.pig&Steps.member.2.HadoopJarStep.Args.member.2=--run-pig-script&Steps.member.2.HadoopJarStep.Args.member.3=--args&Steps.member.2.HadoopJarStep.Args.member.4=-p&Steps.member.2.HadoopJarStep.Args.member.5=INPUT=s3n:\/\/elasticmapreduce\/samples\/pig-apache\/input&Steps.member.2.HadoopJarStep.Args.member.6=-p&Steps.member.2.HadoopJarStep.Args.member.7=OUTPUT=s3n:\/\/slif-elasticity\/pig-apache\/output\/2011-05-04&Steps.member.2.HadoopJarStep.Args.member.8=-p&Steps.member.2.HadoopJarStep.Args.member.9=E_PARALLELS=1&Steps.member.2.HadoopJarStep.Jar=s3:\/\/elasticmapreduce\/libs\/script-runner\/script-runner.jar&Steps.member.2.Name=Run%20Pig%20Script/
|
6
|
+
body:
|
7
7
|
headers:
|
8
8
|
accept:
|
9
9
|
- "*/*; q=0.5, application/xml"
|
@@ -15,20 +15,20 @@
|
|
15
15
|
message: OK
|
16
16
|
headers:
|
17
17
|
x-amzn-requestid:
|
18
|
-
-
|
18
|
+
- c00d285d-7935-11e0-a51d-7bf947dae271
|
19
19
|
content-type:
|
20
20
|
- text/xml
|
21
21
|
date:
|
22
|
-
-
|
22
|
+
- Sun, 08 May 2011 05:40:58 GMT
|
23
23
|
content-length:
|
24
24
|
- "297"
|
25
25
|
body: |
|
26
26
|
<RunJobFlowResponse xmlns="http://elasticmapreduce.amazonaws.com/doc/2009-03-31">
|
27
27
|
<RunJobFlowResult>
|
28
|
-
<JobFlowId>j-
|
28
|
+
<JobFlowId>j-1HB7A3TBRT3VS</JobFlowId>
|
29
29
|
</RunJobFlowResult>
|
30
30
|
<ResponseMetadata>
|
31
|
-
<RequestId>
|
31
|
+
<RequestId>c00d285d-7935-11e0-a51d-7bf947dae271</RequestId>
|
32
32
|
</ResponseMetadata>
|
33
33
|
</RunJobFlowResponse>
|
34
34
|
|
@@ -3,19 +3,78 @@ require 'spec_helper'
|
|
3
3
|
describe Elasticity::PigJob do
|
4
4
|
|
5
5
|
describe ".new" do
|
6
|
-
|
7
6
|
it "should have good defaults" do
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
7
|
+
pig = Elasticity::PigJob.new("access", "secret")
|
8
|
+
pig.aws_access_key_id.should == "access"
|
9
|
+
pig.aws_secret_access_key.should == "secret"
|
10
|
+
pig.ec2_key_name.should == "default"
|
11
|
+
pig.hadoop_version.should == "0.20"
|
12
|
+
pig.instance_count.should == 2
|
13
|
+
pig.master_instance_type.should == "m1.small"
|
14
|
+
pig.name.should == "Elasticity Pig Job"
|
15
|
+
pig.slave_instance_type.should == "m1.small"
|
16
|
+
pig.action_on_failure.should == "TERMINATE_JOB_FLOW"
|
17
|
+
pig.log_uri.should == nil
|
18
|
+
pig.parallels.should == 1
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
describe "#instance_count=" do
|
23
|
+
it "should not allow instances to be set less than 2" do
|
24
|
+
pig = Elasticity::PigJob.new("access", "secret")
|
25
|
+
lambda {
|
26
|
+
pig.instance_count = 1
|
27
|
+
}.should raise_error(ArgumentError, "Instance count cannot be set to less than 2 (requested 1)")
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
describe "calculated value of parallels" do
|
32
|
+
|
33
|
+
before do
|
34
|
+
@pig = Elasticity::PigJob.new("access", "secret")
|
35
|
+
@pig.instance_count = 8
|
36
|
+
end
|
37
|
+
|
38
|
+
context "when slave is m1.small" do
|
39
|
+
it "should be 7" do
|
40
|
+
@pig.slave_instance_type = "m1.small"
|
41
|
+
@pig.parallels.should == 7
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
context "when slave is m1.large" do
|
46
|
+
it "should be 13" do
|
47
|
+
@pig.slave_instance_type = "m1.large"
|
48
|
+
@pig.parallels.should == 13
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
context "when slave is c1.medium" do
|
53
|
+
it "should be 13" do
|
54
|
+
@pig.slave_instance_type = "c1.medium"
|
55
|
+
@pig.parallels.should == 13
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
context "when slave is m1.xlarge" do
|
60
|
+
it "should be 26" do
|
61
|
+
@pig.slave_instance_type = "m1.xlarge"
|
62
|
+
@pig.parallels.should == 26
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
context "when slave is c1.xlarge" do
|
67
|
+
it "should be 26" do
|
68
|
+
@pig.slave_instance_type = "c1.xlarge"
|
69
|
+
@pig.parallels.should == 26
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
context "when slave is any other type" do
|
74
|
+
it "should be 7" do
|
75
|
+
@pig.slave_instance_type = "foo"
|
76
|
+
@pig.parallels.should == 7
|
77
|
+
end
|
19
78
|
end
|
20
79
|
|
21
80
|
end
|
@@ -30,9 +89,9 @@ describe Elasticity::PigJob do
|
|
30
89
|
:instances => {
|
31
90
|
:ec2_key_name => "default",
|
32
91
|
:hadoop_version => "0.20",
|
33
|
-
:instance_count =>
|
92
|
+
:instance_count => 8,
|
34
93
|
:master_instance_type => "m1.small",
|
35
|
-
:slave_instance_type => "m1.
|
94
|
+
:slave_instance_type => "m1.xlarge",
|
36
95
|
},
|
37
96
|
:steps => [
|
38
97
|
{
|
@@ -58,6 +117,7 @@ describe Elasticity::PigJob do
|
|
58
117
|
"--args",
|
59
118
|
"-p", "OUTPUT=s3n://slif-pig-test/output",
|
60
119
|
"-p", "XREFS=s3n://slif-pig-test/xrefs",
|
120
|
+
"-p", "E_PARALLELS=26",
|
61
121
|
"s3n://slif-pig-test/test.pig"
|
62
122
|
],
|
63
123
|
},
|
@@ -65,11 +125,15 @@ describe Elasticity::PigJob do
|
|
65
125
|
}
|
66
126
|
]
|
67
127
|
}).and_return("new_jobflow_id")
|
68
|
-
Elasticity::EMR.should_receive(:new).with("access", "secret").and_return(aws)
|
69
128
|
|
129
|
+
Elasticity::EMR.should_receive(:new).with("access", "secret").and_return(aws)
|
70
130
|
pig = Elasticity::PigJob.new("access", "secret")
|
131
|
+
|
71
132
|
pig.log_uri = "s3n://slif-test/output/logs"
|
72
133
|
pig.action_on_failure = "CONTINUE"
|
134
|
+
pig.instance_count = 8
|
135
|
+
pig.slave_instance_type = "m1.xlarge"
|
136
|
+
|
73
137
|
jobflow_id = pig.run('s3n://slif-pig-test/test.pig', {
|
74
138
|
'OUTPUT' => 's3n://slif-pig-test/output',
|
75
139
|
'XREFS' => 's3n://slif-pig-test/xrefs'
|
@@ -89,7 +153,7 @@ describe Elasticity::PigJob do
|
|
89
153
|
"INPUT" => "s3n://elasticmapreduce/samples/pig-apache/input",
|
90
154
|
"OUTPUT" => "s3n://slif-elasticity/pig-apache/output/2011-05-04"
|
91
155
|
})
|
92
|
-
jobflow_id.should == "j-
|
156
|
+
jobflow_id.should == "j-1HB7A3TBRT3VS"
|
93
157
|
end
|
94
158
|
end
|
95
159
|
|
metadata
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: elasticity
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 29
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 1
|
8
8
|
- 2
|
9
|
-
|
9
|
+
- 1
|
10
|
+
version: 1.2.1
|
10
11
|
platform: ruby
|
11
12
|
authors:
|
12
13
|
- Robert Slifka
|
@@ -14,7 +15,7 @@ autorequire:
|
|
14
15
|
bindir: bin
|
15
16
|
cert_chain: []
|
16
17
|
|
17
|
-
date: 2011-05-
|
18
|
+
date: 2011-05-07 00:00:00 -07:00
|
18
19
|
default_executable:
|
19
20
|
dependencies:
|
20
21
|
- !ruby/object:Gem::Dependency
|