elasticity 1.2 → 1.2.1

Sign up to get free protection for your applications and to get access to all the features.
data/HISTORY.mediawiki CHANGED
@@ -1,3 +1,7 @@
1
+ === 1.2.1 ===
2
+
3
+ * Shipping the E_PARALLELS Pig variable with each invocation; it carries a reasonable default value for PARALLEL based on the number and type of instances configured.
4
+
1
5
  === 1.2 ===
2
6
 
3
7
  * Added PigJob!
data/README.mediawiki CHANGED
@@ -58,6 +58,30 @@ Like HiveJob, PigJob allows you to quickly launch Pig jobs :)
58
58
  > "j-16PZ24OED71C6"
59
59
  </pre>
60
60
 
61
+ === PARALLEL ===
62
+
63
+ Given the importance of specifying a reasonable value for [http://pig.apache.org/docs/r0.8.1/cookbook.html#Use+the+Parallel+Features PARALLEL] (the number of parallel reducers), Elasticity calculates a reasonable default and passes it along with every invocation in the form of a script variable called E_PARALLELS. This default value is based on the formula in the Pig Cookbook and the number of reducers AWS configures per instance.
64
+
65
+ For example, if you have 8 instances in total and your slaves are m1.xlarge (4 reducers per instance), the value is 26: the master is not counted, so 7 * 4 * 0.9 = 25.2, which is rounded up (as shown below).
66
+
67
+ <pre>
68
+ s3://elasticmapreduce/libs/pig/pig-script
69
+ --run-pig-script
70
+ --args
71
+ -p INPUT=s3n://elasticmapreduce/samples/pig-apache/input
72
+ -p OUTPUT=s3n://slif-elasticity/pig-apache/output/2011-05-04
73
+ -p E_PARALLELS=26
74
+ s3n://elasticmapreduce/samples/pig-apache/do-reports.pig
75
+ </pre>
76
+
77
+ Use this as you would any other Pig variable.
78
+
79
+ <pre>
80
+ A = LOAD 'myfile' AS (t, u, v);
81
+ B = GROUP A BY t PARALLEL $E_PARALLELS;
82
+ ...
83
+ </pre>
84
+
61
85
  = Amazon API Reference =
62
86
 
63
87
  Elasticity wraps all of the EMR API calls. Please see the Amazon guide for details on these operations because the default values aren't obvious (e.g. the meaning of <code>DescribeJobFlows</code> without parameters).
@@ -2,9 +2,26 @@ module Elasticity
2
2
 
3
3
  class PigJob < Elasticity::SimpleJob
4
4
 
5
+ # Automatically passed as Pig argument E_PARALLELS
6
+ attr_reader :parallels
7
+
5
8
  def initialize(aws_access_key_id, aws_secret_access_key)
6
9
  super
7
10
  @name = "Elasticity Pig Job"
11
+ @parallels = calculate_parallels
12
+ end
13
+
14
+ def instance_count=(num_instances)
15
+ if num_instances < 2
16
+ raise ArgumentError, "Instance count cannot be set to less than 2 (requested #{num_instances})"
17
+ end
18
+ @instance_count = num_instances
19
+ @parallels = calculate_parallels
20
+ end
21
+
22
+ def slave_instance_type=(instance_type)
23
+ @slave_instance_type = instance_type
24
+ @parallels = calculate_parallels
8
25
  end
9
26
 
10
27
  # Run the specified Pig script with the specified variables.
@@ -23,6 +40,7 @@ module Elasticity
23
40
  pig_variables.keys.sort.each do |variable_name|
24
41
  script_arguments.concat(["-p", "#{variable_name}=#{pig_variables[variable_name]}"])
25
42
  end
43
+ script_arguments.concat(["-p", "E_PARALLELS=#{@parallels}"])
26
44
  script_arguments << pig_script
27
45
  jobflow_config = {
28
46
  :name => @name,
@@ -61,6 +79,32 @@ module Elasticity
61
79
 
62
80
  @emr.run_job_flow(jobflow_config)
63
81
  end
82
+
83
+ private
84
+
85
+ # Calculate a common-sense default value of PARALLEL using the following
86
+ # formula from the Pig Cookbook:
87
+ #
88
+ # <num machines> * <num reduce slots per machine> * 0.9
89
+ #
90
+ # With the following reducer configuration (from an AWS forum post):
91
+ #
92
+ # m1.small 1
93
+ # m1.large 2
94
+ # m1.xlarge 4
95
+ # c1.medium 2
96
+ # c1.xlarge 4
97
+ def calculate_parallels
98
+ reduce_slots = case @slave_instance_type
99
+ when "m1.small" then 1
100
+ when "m1.large" then 2
101
+ when "m1.xlarge" then 4
102
+ when "c1.medium" then 2
103
+ when "c1.xlarge" then 4
104
+ else 1
105
+ end
106
+ ((@instance_count - 1).to_f * reduce_slots.to_f * 0.9).ceil
107
+ end
64
108
 
65
109
  end
66
110
 
@@ -1,3 +1,3 @@
1
1
  module Elasticity
2
- VERSION = "1.2"
2
+ VERSION = "1.2.1"
3
3
  end
@@ -2,8 +2,8 @@
2
2
  - !ruby/struct:VCR::HTTPInteraction
3
3
  request: !ruby/struct:VCR::Request
4
4
  method: :get
5
- uri: !ruby/regexp /^http:\/\/elasticmapreduce\.amazonaws\.com:80\/\?AWSAccessKeyId=AKIAI7HEMMNKGT6VFFSA&Instances\.Ec2KeyName=sharethrough_dev&Instances\.HadoopVersion=0\.20&Instances\.InstanceCount=2&Instances\.MasterInstanceType=m1\.small&Instances\.SlaveInstanceType=m1\.small&LogUri=s3n:\/\/slif-elasticity\/pig-apache\/logs&Name=Elasticity%20Pig%20Job&Operation=RunJobFlow&.*&Steps\.member\.1\.ActionOnFailure=TERMINATE_JOB_FLOW&Steps\.member\.1\.HadoopJarStep\.Args\.member\.1=s3:\/\/elasticmapreduce\/libs\/pig\/pig-script&Steps\.member\.1\.HadoopJarStep\.Args\.member\.2=--base-path&Steps\.member\.1\.HadoopJarStep\.Args\.member\.3=s3:\/\/elasticmapreduce\/libs\/pig\/&Steps\.member\.1\.HadoopJarStep\.Args\.member\.4=--install-pig&Steps\.member\.1\.HadoopJarStep\.Jar=s3:\/\/elasticmapreduce\/libs\/script-runner\/script-runner\.jar&Steps\.member\.1\.Name=Setup%20Pig&Steps\.member\.2\.ActionOnFailure=TERMINATE_JOB_FLOW&Steps\.member\.2\.HadoopJarStep\.Args\.member\.1=s3:\/\/elasticmapreduce\/libs\/pig\/pig-script&Steps\.member\.2\.HadoopJarStep\.Args\.member\.2=--run-pig-script&Steps\.member\.2\.HadoopJarStep\.Args\.member\.3=--args&Steps\.member\.2\.HadoopJarStep\.Args\.member\.4=-p&Steps\.member\.2\.HadoopJarStep\.Args\.member\.5=INPUT=s3n:\/\/elasticmapreduce\/samples\/pig-apache\/input&Steps\.member\.2\.HadoopJarStep\.Args\.member\.6=-p&Steps\.member\.2\.HadoopJarStep\.Args\.member\.7=OUTPUT=s3n:\/\/slif-elasticity\/pig-apache\/output\/2011-05-04&Steps\.member\.2\.HadoopJarStep\.Args\.member\.8=s3n:\/\/elasticmapreduce\/samples\/pig-apache\/do-reports\.pig&Steps\.member\.2\.HadoopJarStep\.Jar=s3:\/\/elasticmapreduce\/libs\/script-runner\/script-runner\.jar&Steps\.member\.2\.Name=Run%20Pig%20Script/
6
- body:
5
+ uri: !ruby/regexp /^http:\/\/elasticmapreduce.amazonaws.com:80\/\?AWSAccessKeyId=AKIAI7HEMMNKGT6VFFSA&Instances.Ec2KeyName=sharethrough_dev&Instances.HadoopVersion=0.20&Instances.InstanceCount=2&Instances.MasterInstanceType=m1.small&Instances.SlaveInstanceType=m1.small&LogUri=s3n:\/\/slif-elasticity\/pig-apache\/logs&Name=Elasticity%20Pig%20Job&Operation=RunJobFlow&.*&Steps.member.1.ActionOnFailure=TERMINATE_JOB_FLOW&Steps.member.1.HadoopJarStep.Args.member.1=s3:\/\/elasticmapreduce\/libs\/pig\/pig-script&Steps.member.1.HadoopJarStep.Args.member.2=--base-path&Steps.member.1.HadoopJarStep.Args.member.3=s3:\/\/elasticmapreduce\/libs\/pig\/&Steps.member.1.HadoopJarStep.Args.member.4=--install-pig&Steps.member.1.HadoopJarStep.Jar=s3:\/\/elasticmapreduce\/libs\/script-runner\/script-runner.jar&Steps.member.1.Name=Setup%20Pig&Steps.member.2.ActionOnFailure=TERMINATE_JOB_FLOW&Steps.member.2.HadoopJarStep.Args.member.1=s3:\/\/elasticmapreduce\/libs\/pig\/pig-script&Steps.member.2.HadoopJarStep.Args.member.10=s3n:\/\/elasticmapreduce\/samples\/pig-apache\/do-reports.pig&Steps.member.2.HadoopJarStep.Args.member.2=--run-pig-script&Steps.member.2.HadoopJarStep.Args.member.3=--args&Steps.member.2.HadoopJarStep.Args.member.4=-p&Steps.member.2.HadoopJarStep.Args.member.5=INPUT=s3n:\/\/elasticmapreduce\/samples\/pig-apache\/input&Steps.member.2.HadoopJarStep.Args.member.6=-p&Steps.member.2.HadoopJarStep.Args.member.7=OUTPUT=s3n:\/\/slif-elasticity\/pig-apache\/output\/2011-05-04&Steps.member.2.HadoopJarStep.Args.member.8=-p&Steps.member.2.HadoopJarStep.Args.member.9=E_PARALLELS=1&Steps.member.2.HadoopJarStep.Jar=s3:\/\/elasticmapreduce\/libs\/script-runner\/script-runner.jar&Steps.member.2.Name=Run%20Pig%20Script/
6
+ body:
7
7
  headers:
8
8
  accept:
9
9
  - "*/*; q=0.5, application/xml"
@@ -15,20 +15,20 @@
15
15
  message: OK
16
16
  headers:
17
17
  x-amzn-requestid:
18
- - b237df84-761d-11e0-b625-05a26eeda1d8
18
+ - c00d285d-7935-11e0-a51d-7bf947dae271
19
19
  content-type:
20
20
  - text/xml
21
21
  date:
22
- - Wed, 04 May 2011 07:11:13 GMT
22
+ - Sun, 08 May 2011 05:40:58 GMT
23
23
  content-length:
24
24
  - "297"
25
25
  body: |
26
26
  <RunJobFlowResponse xmlns="http://elasticmapreduce.amazonaws.com/doc/2009-03-31">
27
27
  <RunJobFlowResult>
28
- <JobFlowId>j-16PZ24OED71C6</JobFlowId>
28
+ <JobFlowId>j-1HB7A3TBRT3VS</JobFlowId>
29
29
  </RunJobFlowResult>
30
30
  <ResponseMetadata>
31
- <RequestId>b237df84-761d-11e0-b625-05a26eeda1d8</RequestId>
31
+ <RequestId>c00d285d-7935-11e0-a51d-7bf947dae271</RequestId>
32
32
  </ResponseMetadata>
33
33
  </RunJobFlowResponse>
34
34
 
@@ -3,19 +3,78 @@ require 'spec_helper'
3
3
  describe Elasticity::PigJob do
4
4
 
5
5
  describe ".new" do
6
-
7
6
  it "should have good defaults" do
8
- hive = Elasticity::PigJob.new("access", "secret")
9
- hive.aws_access_key_id.should == "access"
10
- hive.aws_secret_access_key.should == "secret"
11
- hive.ec2_key_name.should == "default"
12
- hive.hadoop_version.should == "0.20"
13
- hive.instance_count.should == 2
14
- hive.master_instance_type.should == "m1.small"
15
- hive.name.should == "Elasticity Pig Job"
16
- hive.slave_instance_type.should == "m1.small"
17
- hive.action_on_failure.should == "TERMINATE_JOB_FLOW"
18
- hive.log_uri.should == nil
7
+ pig = Elasticity::PigJob.new("access", "secret")
8
+ pig.aws_access_key_id.should == "access"
9
+ pig.aws_secret_access_key.should == "secret"
10
+ pig.ec2_key_name.should == "default"
11
+ pig.hadoop_version.should == "0.20"
12
+ pig.instance_count.should == 2
13
+ pig.master_instance_type.should == "m1.small"
14
+ pig.name.should == "Elasticity Pig Job"
15
+ pig.slave_instance_type.should == "m1.small"
16
+ pig.action_on_failure.should == "TERMINATE_JOB_FLOW"
17
+ pig.log_uri.should == nil
18
+ pig.parallels.should == 1
19
+ end
20
+ end
21
+
22
+ describe "#instance_count=" do
23
+ it "should not allow instances to be set less than 2" do
24
+ pig = Elasticity::PigJob.new("access", "secret")
25
+ lambda {
26
+ pig.instance_count = 1
27
+ }.should raise_error(ArgumentError, "Instance count cannot be set to less than 2 (requested 1)")
28
+ end
29
+ end
30
+
31
+ describe "calculated value of parallels" do
32
+
33
+ before do
34
+ @pig = Elasticity::PigJob.new("access", "secret")
35
+ @pig.instance_count = 8
36
+ end
37
+
38
+ context "when slave is m1.small" do
39
+ it "should be 7" do
40
+ @pig.slave_instance_type = "m1.small"
41
+ @pig.parallels.should == 7
42
+ end
43
+ end
44
+
45
+ context "when slave is m1.large" do
46
+ it "should be 13" do
47
+ @pig.slave_instance_type = "m1.large"
48
+ @pig.parallels.should == 13
49
+ end
50
+ end
51
+
52
+ context "when slave is c1.medium" do
53
+ it "should be 13" do
54
+ @pig.slave_instance_type = "c1.medium"
55
+ @pig.parallels.should == 13
56
+ end
57
+ end
58
+
59
+ context "when slave is m1.xlarge" do
60
+ it "should be 26" do
61
+ @pig.slave_instance_type = "m1.xlarge"
62
+ @pig.parallels.should == 26
63
+ end
64
+ end
65
+
66
+ context "when slave is c1.xlarge" do
67
+ it "should be 26" do
68
+ @pig.slave_instance_type = "c1.xlarge"
69
+ @pig.parallels.should == 26
70
+ end
71
+ end
72
+
73
+ context "when slave is any other type" do
74
+ it "should be 1" do
75
+ @pig.slave_instance_type = "foo"
76
+ @pig.parallels.should == 7
77
+ end
19
78
  end
20
79
 
21
80
  end
@@ -30,9 +89,9 @@ describe Elasticity::PigJob do
30
89
  :instances => {
31
90
  :ec2_key_name => "default",
32
91
  :hadoop_version => "0.20",
33
- :instance_count => 2,
92
+ :instance_count => 8,
34
93
  :master_instance_type => "m1.small",
35
- :slave_instance_type => "m1.small",
94
+ :slave_instance_type => "m1.xlarge",
36
95
  },
37
96
  :steps => [
38
97
  {
@@ -58,6 +117,7 @@ describe Elasticity::PigJob do
58
117
  "--args",
59
118
  "-p", "OUTPUT=s3n://slif-pig-test/output",
60
119
  "-p", "XREFS=s3n://slif-pig-test/xrefs",
120
+ "-p", "E_PARALLELS=26",
61
121
  "s3n://slif-pig-test/test.pig"
62
122
  ],
63
123
  },
@@ -65,11 +125,15 @@ describe Elasticity::PigJob do
65
125
  }
66
126
  ]
67
127
  }).and_return("new_jobflow_id")
68
- Elasticity::EMR.should_receive(:new).with("access", "secret").and_return(aws)
69
128
 
129
+ Elasticity::EMR.should_receive(:new).with("access", "secret").and_return(aws)
70
130
  pig = Elasticity::PigJob.new("access", "secret")
131
+
71
132
  pig.log_uri = "s3n://slif-test/output/logs"
72
133
  pig.action_on_failure = "CONTINUE"
134
+ pig.instance_count = 8
135
+ pig.slave_instance_type = "m1.xlarge"
136
+
73
137
  jobflow_id = pig.run('s3n://slif-pig-test/test.pig', {
74
138
  'OUTPUT' => 's3n://slif-pig-test/output',
75
139
  'XREFS' => 's3n://slif-pig-test/xrefs'
@@ -89,7 +153,7 @@ describe Elasticity::PigJob do
89
153
  "INPUT" => "s3n://elasticmapreduce/samples/pig-apache/input",
90
154
  "OUTPUT" => "s3n://slif-elasticity/pig-apache/output/2011-05-04"
91
155
  })
92
- jobflow_id.should == "j-16PZ24OED71C6"
156
+ jobflow_id.should == "j-1HB7A3TBRT3VS"
93
157
  end
94
158
  end
95
159
 
metadata CHANGED
@@ -1,12 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: elasticity
3
3
  version: !ruby/object:Gem::Version
4
- hash: 11
4
+ hash: 29
5
5
  prerelease:
6
6
  segments:
7
7
  - 1
8
8
  - 2
9
- version: "1.2"
9
+ - 1
10
+ version: 1.2.1
10
11
  platform: ruby
11
12
  authors:
12
13
  - Robert Slifka
@@ -14,7 +15,7 @@ autorequire:
14
15
  bindir: bin
15
16
  cert_chain: []
16
17
 
17
- date: 2011-05-04 00:00:00 -07:00
18
+ date: 2011-05-07 00:00:00 -07:00
18
19
  default_executable:
19
20
  dependencies:
20
21
  - !ruby/object:Gem::Dependency