rp-emr 1.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +4 -0
- data/.rspec +1 -0
- data/CHANGELOG.md +18 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +23 -0
- data/README.md +240 -0
- data/Rakefile +6 -0
- data/bin/emr +5 -0
- data/lib/rp/emr/bootstrap_action.rb +38 -0
- data/lib/rp/emr/cli.rb +249 -0
- data/lib/rp/emr/instance_group.rb +36 -0
- data/lib/rp/emr/instance_groups.rb +66 -0
- data/lib/rp/emr/instances.rb +36 -0
- data/lib/rp/emr/job.rb +42 -0
- data/lib/rp/emr/step/pig.rb +84 -0
- data/lib/rp/emr/step/s3_dist_cp.rb +93 -0
- data/lib/rp/emr/step/setup_debugging.rb +28 -0
- data/lib/rp/emr/step/setup_hive.rb +36 -0
- data/lib/rp/emr/step/setup_pig.rb +36 -0
- data/lib/rp/emr/step.rb +21 -0
- data/lib/rp/emr/version.rb +5 -0
- data/lib/rp/emr.rb +26 -0
- data/rp-emr.gemspec +31 -0
- data/spec/rp/emr/bootstrap_action_spec.rb +23 -0
- data/spec/rp/emr/instance_group_spec.rb +51 -0
- data/spec/rp/emr/instance_groups_spec.rb +106 -0
- data/spec/rp/emr/instances_spec.rb +23 -0
- data/spec/rp/emr/job_spec.rb +31 -0
- data/spec/rp/emr/step/pig_spec.rb +136 -0
- data/spec/rp/emr/step/s3_dist_cp_step_spec.rb +83 -0
- data/spec/rp/emr/step/setup_debugging_spec.rb +29 -0
- data/spec/rp/emr/step/setup_pig_spec.rb +47 -0
- data/spec/rp/emr/step_spec.rb +33 -0
- data/spec/rp/emr_spec.rb +5 -0
- data/spec/spec_helper.rb +10 -0
- metadata +221 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 608a51bf27824ca9856bbdc548a0d810b0e5af65
|
4
|
+
data.tar.gz: 0f4d6fe64878316614c9bccafab5cee6e553bbb8
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: b6d2180e2f2bc109ffca80e3f531297dad50f94f7dfef79ce8bc21ac2010162e09fa2d7d87691031cf00784de4c93d71704822aec102800edce5d36c73609b1c
|
7
|
+
data.tar.gz: 213a4f7215cf3848cef5986a3c727e131f6494c32806b336cbf1c134b89b0330688e6ea4997be8783faa6a652d0a870dac105f30ff55711e5b2edef33e175e33
|
data/.gitignore
ADDED
data/.rspec
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--color
|
data/CHANGELOG.md
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# 1.0.3
|
2
|
+
|
3
|
+
* Implement SetupHive step
|
4
|
+
|
5
|
+
# 1.0.2
|
6
|
+
|
7
|
+
* Add service_role option when launching a Job.
|
8
|
+
* BUG: Fix job_flow_role option to be properly set when passed to Job in CLI launcher.
|
9
|
+
|
10
|
+
# 1.0.1
|
11
|
+
|
12
|
+
* `add_setup_pig_step_method_options` now allows `pig_version` to be configured
|
13
|
+
* `S3DistCp` now allows for the `s3_distcp_jar` to be configured
|
14
|
+
|
15
|
+
# 1.0.0
|
16
|
+
|
17
|
+
* 1 major enhancement
|
18
|
+
* Birthday!
|
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
Copyright (c) 2013 ReturnPath, Inc.
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
|
+
|
data/README.md
ADDED
@@ -0,0 +1,240 @@
|
|
1
|
+
# ReturnPath EMR Tools
|
2
|
+
|
3
|
+
This is a Ruby library for creating & launching jobs on AWS's Elastic MapReduce
|
4
|
+
service. The library provides two basic tools: a set of classes to encapsulate
|
5
|
+
the data structures expected by the EMR client, and a set of Thor helpers to
|
6
|
+
simplify building job launchers.
|
7
|
+
|
8
|
+
## Client Wrapper
|
9
|
+
|
10
|
+
The AWS EMR client is very low level, and basically expects a hash of values.
|
11
|
+
rp-emr provides wrappers for the basic data types and some helpers for building
|
12
|
+
collections. All objects are built using the
|
13
|
+
[assembler](https://github.com/benhamill/assembler) gem, so you can mix values
|
14
|
+
between method-call syntax and builder-block syntax.
|
15
|
+
|
16
|
+
The basic bits look like this:
|
17
|
+
|
18
|
+
```ruby
|
19
|
+
# Executes a script before the cluster starts processing steps
|
20
|
+
bootstrap_action = RP::EMR::BootstrapAction.new(name: 'action name') do |a|
|
21
|
+
a.path = 's3://path_to_script_to_run'
|
22
|
+
a.args = ['--option value', '--other-option value']
|
23
|
+
end
|
24
|
+
|
25
|
+
# Runs a hadoop jar. This is the bare-bones version, you'll probably want to
|
26
|
+
# use one of the classes in lib/rp/emr/step
|
27
|
+
step = RP::EMR::Step.new(name: 'step name') do |s|
|
28
|
+
s.action_on_failure = 'CANCEL_AND_WAIT'
|
29
|
+
s.hadoop_jar_step = {
|
30
|
+
jar: 's3://path_to_jar',
|
31
|
+
args: ['--option value', '--other-option value'],
|
32
|
+
}
|
33
|
+
end
|
34
|
+
|
35
|
+
# Runs a pig script
|
36
|
+
pig_step = RP::EMR::Step::Pig.new(name: 'pig step') do |s|
|
37
|
+
s.script_path = '/local/path/to/pig_script.pig'
|
38
|
+
s.script_bucket = 'bucket_to_upload_script_to'
|
39
|
+
s.args = ['--args_to_append_to_job']
|
40
|
+
s.pig_params = {'PIG_PARAM' => 'value'}
|
41
|
+
s.pig_version = '0.11.1.1'
|
42
|
+
s.action_on_failure = 'CANCEL_AND_WAIT'
|
43
|
+
s.dry_run = false
|
44
|
+
end
|
45
|
+
|
46
|
+
# There are also steps for setting up pig, setting up debugging, using S3DistCP, etc
|
47
|
+
|
48
|
+
# Creates an instance group. As with RP::EMR::Step, you probably shouldn't be
|
49
|
+
# using this directly, just RP::EMR::InstanceGroups instead
|
50
|
+
instance_group = RP::EMR::InstanceGroup.new(name: 'custom instance group') do |ig|
|
51
|
+
ig.instance_role = 'MASTER'
|
52
|
+
ig.instance_type = 'c1.medium'
|
53
|
+
ig.instance_count = 100
|
54
|
+
ig.market = 'SPOT'
|
55
|
+
ig.bid_price = 2.0
|
56
|
+
end
|
57
|
+
|
58
|
+
# Defines the different instances groups to be used. All the options for
|
59
|
+
# RP::EMR::InstanceGroup are supported, along with a default instance type
|
60
|
+
instance_groups = RP::EMR::InstanceGroups.new do |ig|
|
61
|
+
ig.default_instance_type = 'c1.medium'
|
62
|
+
|
63
|
+
ig.master_instance_type = 'c3.xlarge'
|
64
|
+
|
65
|
+
ig.core_instance_count = 5
|
66
|
+
|
67
|
+
ig.task_instance_count = 100
|
68
|
+
ig.task_instance_market = 'SPOT'
|
69
|
+
ig.task_bid_price = 2.0
|
70
|
+
end
|
71
|
+
|
72
|
+
# Top-level instance definition
|
73
|
+
instances = RP::EMR::Instances.new do |i|
|
74
|
+
i.instance_groups = instance_groups
|
75
|
+
i.ec2_key_name = 'my_key_name'
|
76
|
+
i.hadoop_version = '2.0'
|
77
|
+
end
|
78
|
+
|
79
|
+
# Now we can construct the actual job
|
80
|
+
job = RP::EMR::Job.new do |j|
|
81
|
+
j.instances = instances
|
82
|
+
j.steps = [step, pig_step]
|
83
|
+
j.ami_version = :latest
|
84
|
+
j.bootstrap_actions = [bootstrap_action]
|
85
|
+
j.visible_to_all_users = true
|
86
|
+
j.job_flow_role = 'MyIAMRole'
|
87
|
+
j.tags = ['analysis']
|
88
|
+
end
|
89
|
+
|
90
|
+
# Launch the job using the AWS API
|
91
|
+
AWS::EMR.new.job_flows.create('job_name', job.to_hash)
|
92
|
+
```
|
93
|
+
|
94
|
+
|
95
|
+
## Thor Helpers
|
96
|
+
|
97
|
+
The API wrapper is all fine and dandy, but it's still a pain to work with. So
|
98
|
+
there's a set of Thor helpers to make building jobs easier - they define things
|
99
|
+
like defaults, option parsing, and other goodness.
|
100
|
+
|
101
|
+
The gem installs a script called `emr` which provides basic options if you want
|
102
|
+
to build jobs interactively
|
103
|
+
|
104
|
+
```bash
|
105
|
+
bundle exec emr help
|
106
|
+
> Commands:
|
107
|
+
> emr add_pig_script_step JOB_ID SCRIPT_PATH # Add a Pig script step to an existing job
|
108
|
+
> emr add_rollup_step JOB_ID INPUT OUTPUT # Add a S3DistCp rollup step to an existing job
|
109
|
+
> emr add_setup_pig_step JOB_ID # Add a setup pig step to an existing job
|
110
|
+
> emr create_job JOB_NAME # Create an EMR job
|
111
|
+
> emr help [COMMAND] # Describe available commands or one specific command
|
112
|
+
>
|
113
|
+
> Options:
|
114
|
+
> -a, [--keep-alive], [--no-keep-alive] # Set to true if you want the cluster to stay alive after completion/failure
|
115
|
+
> -v, [--verbose], [--no-verbose] # Print lots of stuff
|
116
|
+
> [--dry-run], [--no-dry-run] # Don't actually talk to AWS
|
117
|
+
```
|
118
|
+
|
119
|
+
While these can be useful, the real goal is to make it easy to roll your own
|
120
|
+
CLI using these as building blocks. This is accomplished by providing class-level
|
121
|
+
helpers to import the options used for each step, allowing you to invoke them
|
122
|
+
as modular components.
|
123
|
+
|
124
|
+
For example:
|
125
|
+
|
126
|
+
```ruby
|
127
|
+
#!/usr/bin/env ruby
|
128
|
+
|
129
|
+
require 'rp/emr'
|
130
|
+
require 'thor'
|
131
|
+
|
132
|
+
class ExampleCLI < Thor
|
133
|
+
# This brings all the class-level helpers in
|
134
|
+
extend RP::EMR::CLI::TaskOptions
|
135
|
+
|
136
|
+
# Creates shared options like --dry-run and --verbose
|
137
|
+
cli_class_options
|
138
|
+
|
139
|
+
# We're going to write a CLI for launching a pig script. The first thing
|
140
|
+
# we do is give it a name (this is standard Thor)
|
141
|
+
desc "pig", "Test a pig script"
|
142
|
+
|
143
|
+
# We'll need to launch a cluster to do our computation with. This method adds
|
144
|
+
# the options we'll use to create the cluster. Values passed to the method are
|
145
|
+
# used as the defaults
|
146
|
+
create_job_method_options(
|
147
|
+
default_instance_type: 'm1.large',
|
148
|
+
core_instance_count: 2,
|
149
|
+
task_instance_count: 6,
|
150
|
+
job_flow_role: 'MyIAMRole',
|
151
|
+
)
|
152
|
+
|
153
|
+
# Here we're importing the options used to control how Pig is setup
|
154
|
+
add_setup_pig_step_method_options
|
155
|
+
|
156
|
+
# And here we're importing options used to create a Pig step generally
|
157
|
+
add_pig_script_step_method_options(
|
158
|
+
script_bucket: 'my-emr-scripts-bucket',
|
159
|
+
)
|
160
|
+
|
161
|
+
# Let's define some options specific to the task we're trying to complete
|
162
|
+
method_option :output, default: 'counted_words'
|
163
|
+
def pig
|
164
|
+
script_path = File.expand_path('../count_words.pig', __FILE__)
|
165
|
+
input_path = "s3://my-input-bucket/words"
|
166
|
+
output_path = "s3://my-output-bucket/#{options[:output]}/#{Time.now.to_i}"
|
167
|
+
|
168
|
+
# These will be available in our Pig script as '$INPUT' and '$OUTPUT'
|
169
|
+
pig_step_args = { pig_params: options[:pig_params].merge(
|
170
|
+
'INPUT' => input_path,
|
171
|
+
'OUTPUT' => output_path,
|
172
|
+
)}
|
173
|
+
|
174
|
+
# Now that we've constructed our options, we'll use the Thor task in lib/rp/emr/cli
|
175
|
+
# to create a job flow. The task returns the job identifier, and we're passing
|
176
|
+
# the options hash that Thor parsed for us (this is why we did all that setup
|
177
|
+
# earlier)
|
178
|
+
job_id = invoke 'emr:create_job', ['Word Count Job'], options
|
179
|
+
|
180
|
+
# The job has been created, so we'll add a step to setup pig
|
181
|
+
invoke 'emr:add_setup_pig_step', [job_id], options
|
182
|
+
|
183
|
+
# And finally we'll add our pig script. Notice that we're merging the pig
|
184
|
+
# args into the options hash. We could also have passed these options as CLI
|
185
|
+
# options - this lets us do complicated stuff like date coercions in Ruby
|
186
|
+
invoke 'emr:add_pig_script_step', [job_id, script_path], options.merge(pig_step_args)
|
187
|
+
end
|
188
|
+
end
|
189
|
+
|
190
|
+
ExampleCLI.start
|
191
|
+
```
|
192
|
+
|
193
|
+
Now, we can get a nice help page describing all the options available to us
|
194
|
+
|
195
|
+
```bash
|
196
|
+
bundle exec ./word_count_cli --help
|
197
|
+
> Commands:
|
198
|
+
> word_count_cli help [COMMAND] # Describe available commands or one specific command
|
199
|
+
> word_count_cli pig # Test a pig script
|
200
|
+
>
|
201
|
+
> Options:
|
202
|
+
> -a, [--keep-alive], [--no-keep-alive] # Set to true if you want the cluster to stay alive after completion/failure
|
203
|
+
> -v, [--verbose], [--no-verbose] # Print lots of stuff
|
204
|
+
> [--dry-run], [--no-dry-run] # Don't actually talk to AWS
|
205
|
+
|
206
|
+
bundle exec ./word_count_cli help pig
|
207
|
+
> Usage:
|
208
|
+
> word_count_cli pig
|
209
|
+
>
|
210
|
+
> Options:
|
211
|
+
> -k, [--ec2-key-name=KEY_NAME] # An AWS keypair for the cluster. Useful if you want to shell into the cluster
|
212
|
+
> [--default-instance-type=INSTANCE_TYPE] # The EC2 instance type to use for the cluster
|
213
|
+
> # Default: m1.large
|
214
|
+
> [--master-instance-type=INSTANCE_TYPE] # The EC2 instance type to use for the cluster master group
|
215
|
+
> [--master-instance-count=N] # The number of task instances to create in the cluster master group
|
216
|
+
> [--core-instance-type=INSTANCE_TYPE] # The EC2 instance type to use for the cluster core group
|
217
|
+
> [--core-instance-count=N] # The number of task instances to create in the cluster core group
|
218
|
+
> # Default: 2
|
219
|
+
> [--task-instance-type=INSTANCE_TYPE] # The EC2 instance type to use for the cluster task group
|
220
|
+
> [--task-instance-count=N] # The number of task instances to create in the cluster task group
|
221
|
+
> # Default: 6
|
222
|
+
> [--task-bid-price=N.NN] # If set, will use spot instances for task trackers with this bid price
|
223
|
+
> [--job-flow-role=IAM_ROLE] # IAM Role for the job flow
|
224
|
+
> # Default: MyIAMRole
|
225
|
+
> [--script-bucket=BUCKET] # The S3 bucket to use for storing the Pig script
|
226
|
+
> # Default: my-emr-scripts-bucket
|
227
|
+
> -p, [--pig-params=PARAM:VALUE] # Parameters to be passed to the pig script
|
228
|
+
> [--output=OUTPUT]
|
229
|
+
> -a, [--keep-alive], [--no-keep-alive] # Set to true if you want the cluster to stay alive after completion/failure
|
230
|
+
> -v, [--verbose], [--no-verbose] # Print lots of stuff
|
231
|
+
> [--dry-run], [--no-dry-run] # Don't actually talk to AWS
|
232
|
+
|
233
|
+
bundle exec ./word_count_cli pig --output foo --dry-run
|
234
|
+
> -----------
|
235
|
+
> Created job flow job_flow_id with ["Word Count Job"], {"keep_alive"=>false, "verbose"=>false, "dry_run"=>true, ...}
|
236
|
+
> -----------
|
237
|
+
> Added setup pig step to job_flow_id with ["job_flow_id"], {"keep_alive"=>false, "verbose"=>false, "dry_run"=>true, ...}
|
238
|
+
> -----------
|
239
|
+
> Added pig script step to job_flow_id with ["job_flow_id", "count_words.pig"], {"keep_alive"=>false, ...}
|
240
|
+
```
|
data/Rakefile
ADDED
data/bin/emr
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
module RP
  module EMR
    # Wraps one EMR bootstrap action: a named script, identified by its path,
    # together with the arguments to hand to that script.
    #
    # Instances are assembled from a required +name+ and +path+ plus an
    # optional +args+ list (defaults to an empty array).
    #
    # @example
    #   def bootstrap_hadoop
    #     RP::EMR::BootstrapAction.new(
    #       name: 'Configure Hadoop',
    #       path: 's3://elasticmapreduce/bootstrap-actions/configure-hadoop',
    #       args: ['-c', 'fs.s3n.multipart.uploads.enabled=false']
    #     )
    #   end
    #
    #   def bootstrap_daemons
    #     RP::EMR::BootstrapAction.new(
    #       name: 'Configure Daemons',
    #       path: 's3://elasticmapreduce/bootstrap-actions/configure-daemons',
    #       args: ['--namenode-heap-size=15000'],
    #     )
    #   end
    #
    class BootstrapAction
      extend Assembler

      assemble_from :name, :path, args: []

      # Serializes the action into a nested hash.
      #
      # @return [Hash] +:name+ at the top level, with +:path+ and +:args+
      #   nested under +:script_bootstrap_action+
      def to_hash
        action = { name: name }
        action[:script_bootstrap_action] = { path: path, args: args }
        action
      end
    end
  end
end
|
data/lib/rp/emr/cli.rb
ADDED
@@ -0,0 +1,249 @@
|
|
1
|
+
module RP
  module EMR
    # Thor command-line interface for creating EMR job flows and appending
    # steps (Pig setup, Hive setup, S3DistCp rollup, Pig script) to them.
    #
    # The option declarations live in TaskOptions so that other Thor classes
    # can `extend RP::EMR::CLI::TaskOptions` and reuse them.
    class CLI < Thor
      # Class-level helpers that declare the Thor options read by each command.
      #
      # Every `*_method_options` helper accepts a `defaults` hash keyed by
      # option name (Symbol); the supplied values become the option defaults.
      module TaskOptions
        # Declares the options shared by every command: keep_alive, verbose
        # and dry_run (all boolean, all defaulting to false).
        def cli_class_options
          class_option :keep_alive, aliases: '-a', default: false, type: :boolean, desc: 'Set to true if you want the cluster to stay alive after completion/failure'
          class_option :verbose, aliases: '-v', default: false, type: :boolean, desc: 'Print lots of stuff'
          class_option :dry_run, default: false, type: :boolean, desc: "Don't actually talk to AWS"
        end

        # Declares the options consumed by the `create_job` command.
        #
        # @param defaults [Hash] default values keyed by option name
        def create_job_method_options(defaults = {})
          method_option(:ec2_key_name,
            default: defaults[:ec2_key_name],
            aliases: '-k',
            banner: 'KEY_NAME',
            desc: "An AWS keypair for the cluster. Useful if you want to shell into the cluster",
          )
          method_option(:default_instance_type,
            default: defaults[:default_instance_type],
            banner: 'INSTANCE_TYPE',
            desc: "The EC2 instance type to use for the cluster",
          )
          method_option(:master_instance_type,
            default: defaults[:master_instance_type],
            banner: 'INSTANCE_TYPE',
            desc: "The EC2 instance type to use for the cluster master group",
          )
          method_option(:master_instance_count,
            default: defaults[:master_instance_count],
            type: :numeric,
            banner: 'N',
            desc: "The number of task instances to create in the cluster master group",
          )
          method_option(:core_instance_type,
            default: defaults[:core_instance_type],
            banner: 'INSTANCE_TYPE',
            desc: "The EC2 instance type to use for the cluster core group",
          )
          method_option(:core_instance_count,
            default: defaults[:core_instance_count],
            type: :numeric,
            banner: 'N',
            desc: "The number of task instances to create in the cluster core group",
          )
          method_option(:task_instance_type,
            default: defaults[:task_instance_type],
            banner: 'INSTANCE_TYPE',
            desc: "The EC2 instance type to use for the cluster task group",
          )
          method_option(:task_instance_count,
            default: defaults[:task_instance_count],
            type: :numeric,
            banner: 'N',
            desc: "The number of task instances to create in the cluster task group",
          )
          method_option(:task_bid_price,
            default: defaults[:task_bid_price],
            type: :numeric,
            banner: 'N.NN',
            desc: "If set, will use spot instances for task trackers with this bid price",
          )
          method_option(:job_flow_role,
            default: defaults[:job_flow_role],
            banner: 'IAM_ROLE',
            desc: "IAM Role for the job flow",
          )
          method_option(:service_role,
            default: defaults[:service_role],
            banner: 'IAM_ROLE',
            desc: "IAM Role for the service",
          )
        end

        # Declares the option consumed by the `add_setup_pig_step` command.
        #
        # @param defaults [Hash] `:pig_version` overrides the '0.11.1.1' default
        def add_setup_pig_step_method_options(defaults = {})
          method_option(:pig_version,
            default: defaults[:pig_version] || '0.11.1.1',
            desc: 'Version of Pig to install'
          )
        end

        # Declares the option consumed by the `add_setup_hive_step` command.
        #
        # @param defaults [Hash] `:hive_version` overrides the 'latest' default
        def add_setup_hive_step_method_options(defaults = {})
          method_option(:hive_version,
            default: defaults[:hive_version] || 'latest',
            desc: 'Version of Hive to install'
          )
        end

        # Declares the options consumed by the `add_rollup_step` command.
        #
        # @param defaults [Hash] default values keyed by option name
        def add_rollup_step_method_options(defaults = {})
          method_option(:rollup_input_pattern,
            default: defaults[:rollup_input_pattern],
            desc: 'Java-compatible regex to filter input',
          )
          method_option(:rollup_group_by,
            default: defaults[:rollup_group_by],
            desc: 'Java-compatible regex with a single capture group',
          )
          method_option(:rollup_target_size,
            default: defaults[:rollup_target_size],
            type: :numeric,
            desc: 'The target file size for rolled up files',
          )
        end

        # Declares the options consumed by the `add_pig_script_step` command.
        #
        # @param defaults [Hash] default values keyed by option name;
        #   `:pig_params` falls back to an empty hash
        def add_pig_script_step_method_options(defaults = {})
          method_option(:script_bucket,
            default: defaults[:script_bucket],
            banner: 'BUCKET',
            desc: 'The S3 bucket to use for storing the Pig script',
          )
          method_option(:pig_params,
            default: defaults[:pig_params] || {},
            aliases: '-p',
            type: :hash,
            banner: 'PARAM:VALUE',
            desc: 'Parameters to be passed to the pig script',
          )
        end
      end

      extend TaskOptions

      namespace :emr

      cli_class_options

      desc "create_job JOB_NAME", "Create an EMR job"
      create_job_method_options
      # Builds the instance, debugging-step and job definitions from the parsed
      # options and creates the job flow (or fakes one when --dry-run is set).
      #
      # @param job_name [String] name given to the new job flow
      # @return [String] the job flow id ('job_flow_id' on a dry run)
      def create_job(job_name, *)
        instances = RP::EMR::Instances.new do |i|
          i.hadoop_version = '2.2.0'
          i.ec2_key_name = options[:ec2_key_name] if options[:ec2_key_name]
          i.keep_job_flow_alive_when_no_steps = options[:keep_alive]

          # Only options that were actually supplied are assigned, so unset
          # values are left to the InstanceGroups builder.
          i.instance_groups = RP::EMR::InstanceGroups.new do |ig|
            ig.default_instance_type = options[:default_instance_type] if options[:default_instance_type]

            ig.master_instance_type = options[:master_instance_type] if options[:master_instance_type]
            ig.master_instance_count = options[:master_instance_count] if options[:master_instance_count]

            ig.core_instance_type = options[:core_instance_type] if options[:core_instance_type]
            ig.core_instance_count = options[:core_instance_count] if options[:core_instance_count]

            ig.task_instance_type = options[:task_instance_type] if options[:task_instance_type]
            ig.task_instance_count = options[:task_instance_count] if options[:task_instance_count]
            ig.task_bid_price = options[:task_bid_price] if options[:task_bid_price]
          end.to_a
        end

        setup_debugging_step = RP::EMR::Step::SetupDebugging.new do |s|
          s.action_on_failure = 'CANCEL_AND_WAIT' if options[:keep_alive]
        end

        # The block parameter is deliberately not called `job`, so it does not
        # shadow the local being assigned.
        job = RP::EMR::Job.new do |j|
          # NOTE(review): the log bucket/prefix is hard-coded; consider making
          # it configurable. `String#underscore` is not core Ruby - presumably
          # supplied by ActiveSupport, confirm it is loaded.
          j.log_uri = "s3://oib-mapreduce/logs/mosaic_analysis/#{job_name.underscore}"
          j.instances = instances.to_hash
          j.steps = [setup_debugging_step.to_hash]
          j.job_flow_role = options[:job_flow_role] if options[:job_flow_role]
          j.service_role = options[:service_role] if options[:service_role]
        end

        job_flow =
          if options[:dry_run]
            # Stand-in object so the rest of the method can still read `id`.
            OpenStruct.new(id: 'job_flow_id')
          else
            AWS::EMR.new.job_flows.create(job_name, job.to_hash)
          end

        puts '-----------'
        puts "Created job flow #{job_flow.id} with #{args}, #{options}"
        pp job.to_hash if options[:verbose]

        job_flow.id
      end

      desc "add_setup_pig_step JOB_ID", "Add a setup pig step to an existing job"
      add_setup_pig_step_method_options
      # Appends a SetupPig step to the job flow identified by +job_id+.
      #
      # @param job_id [String] id of an existing job flow
      def add_setup_pig_step(job_id, *)
        job = AWS::EMR.new.job_flows[job_id]

        step = RP::EMR::Step::SetupPig.new do |s|
          s.action_on_failure = 'CANCEL_AND_WAIT' if options[:keep_alive]
          s.pig_version = options[:pig_version] if options[:pig_version]
        end

        attach_step(job, step, 'setup pig step')
      end

      desc "add_setup_hive_step JOB_ID", "Add a setup hive step to an existing job"
      add_setup_hive_step_method_options
      # Appends a SetupHive step to the job flow identified by +job_id+.
      #
      # @param job_id [String] id of an existing job flow
      def add_setup_hive_step(job_id, *)
        job = AWS::EMR.new.job_flows[job_id]

        step = RP::EMR::Step::SetupHive.new do |s|
          s.action_on_failure = 'CANCEL_AND_WAIT' if options[:keep_alive]
          s.hive_version = options[:hive_version] if options[:hive_version]
        end

        attach_step(job, step, 'setup hive step')
      end

      desc "add_rollup_step JOB_ID INPUT OUTPUT", "Add a S3DistCp rollup step to an existing job"
      add_rollup_step_method_options
      # Appends an S3DistCp step named 'Rollup' that copies +input+ to
      # +output+, applying whichever rollup options were supplied.
      #
      # @param job_id [String] id of an existing job flow
      # @param input [String] source location handed to S3DistCp as `src`
      # @param output [String] destination handed to S3DistCp as `dest`
      def add_rollup_step(job_id, input, output, *)
        job = AWS::EMR.new.job_flows[job_id]

        step = RP::EMR::Step::S3DistCp.new(
          name: 'Rollup',
          src: input,
          dest: output,
        ) do |s|
          s.srcPattern = options[:rollup_input_pattern] if options[:rollup_input_pattern]
          s.groupBy = options[:rollup_group_by] if options[:rollup_group_by]
          s.targetSize = options[:rollup_target_size] if options[:rollup_target_size]
          s.action_on_failure = 'CANCEL_AND_WAIT' if options[:keep_alive]
        end

        attach_step(job, step, 'rollup step')
      end

      desc "add_pig_script_step JOB_ID SCRIPT_PATH", "Add a Pig script step to an existing job"
      add_pig_script_step_method_options
      # Appends a Pig step named 'Pig' that runs the script at +script_path+.
      #
      # @param job_id [String] id of an existing job flow
      # @param script_path [String] path of the Pig script to run
      def add_pig_script_step(job_id, script_path, *)
        job = AWS::EMR.new.job_flows[job_id]

        step = RP::EMR::Step::Pig.new(
          name: 'Pig',
          script_path: script_path,
          script_bucket: options[:script_bucket],
        ) do |s|
          s.pig_params = options[:pig_params] if options[:pig_params]
          s.action_on_failure = 'CANCEL_AND_WAIT' if options[:keep_alive]
          s.dry_run = options[:dry_run]
        end

        attach_step(job, step, 'pig script step')
      end

      private

      # Shared tail of every `add_*_step` command: submits the step unless
      # --dry-run is set, prints a summary line, and dumps the step hash when
      # --verbose is set. Being private, Thor does not expose it as a command.
      #
      # @param job [#add_steps, #id] job flow handle the step is added to
      # @param step [#to_hash] step definition to submit
      # @param label [String] human-readable step name used in the summary
      def attach_step(job, step, label)
        job.add_steps([step.to_hash]) unless options[:dry_run]
        puts '-----------'
        puts "Added #{label} to #{job.id} with #{args}, #{options}"
        pp step.to_hash if options[:verbose]
      end
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
module RP
  module EMR
    # Describes one EMR instance group (role, instance type and count, plus
    # optional name, market and bid price) and serializes it to a hash.
    class InstanceGroup
      extend Assembler

      assemble_from(
        # Required params
        :instance_role,
        :instance_type,
        :instance_count,

        # Optional params
        name: nil,
        market: nil,
        bid_price: nil,
      )

      # Serializes the group, leaving out every entry whose value is nil,
      # false, or empty. The bid price is emitted as a String.
      #
      # @return [Hash] only the populated attributes
      def to_hash
        attributes = {
          name: name,
          market: market,
          instance_role: instance_role,
          bid_price: bid_price.to_s,
          instance_type: instance_type,
          instance_count: instance_count,
        }

        attributes.select do |_key, value|
          next false unless value
          next false if value.respond_to?(:empty?) && value.empty?

          true
        end
      end

      private

      # A bid price forces the market to 'SPOT'; otherwise the configured
      # market (possibly nil) is used.
      def market
        return 'SPOT' if bid_price

        @market
      end
    end
  end
end
|