rp-emr 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +4 -0
- data/.rspec +1 -0
- data/CHANGELOG.md +18 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +23 -0
- data/README.md +240 -0
- data/Rakefile +6 -0
- data/bin/emr +5 -0
- data/lib/rp/emr/bootstrap_action.rb +38 -0
- data/lib/rp/emr/cli.rb +249 -0
- data/lib/rp/emr/instance_group.rb +36 -0
- data/lib/rp/emr/instance_groups.rb +66 -0
- data/lib/rp/emr/instances.rb +36 -0
- data/lib/rp/emr/job.rb +42 -0
- data/lib/rp/emr/step/pig.rb +84 -0
- data/lib/rp/emr/step/s3_dist_cp.rb +93 -0
- data/lib/rp/emr/step/setup_debugging.rb +28 -0
- data/lib/rp/emr/step/setup_hive.rb +36 -0
- data/lib/rp/emr/step/setup_pig.rb +36 -0
- data/lib/rp/emr/step.rb +21 -0
- data/lib/rp/emr/version.rb +5 -0
- data/lib/rp/emr.rb +26 -0
- data/rp-emr.gemspec +31 -0
- data/spec/rp/emr/bootstrap_action_spec.rb +23 -0
- data/spec/rp/emr/instance_group_spec.rb +51 -0
- data/spec/rp/emr/instance_groups_spec.rb +106 -0
- data/spec/rp/emr/instances_spec.rb +23 -0
- data/spec/rp/emr/job_spec.rb +31 -0
- data/spec/rp/emr/step/pig_spec.rb +136 -0
- data/spec/rp/emr/step/s3_dist_cp_step_spec.rb +83 -0
- data/spec/rp/emr/step/setup_debugging_spec.rb +29 -0
- data/spec/rp/emr/step/setup_pig_spec.rb +47 -0
- data/spec/rp/emr/step_spec.rb +33 -0
- data/spec/rp/emr_spec.rb +5 -0
- data/spec/spec_helper.rb +10 -0
- metadata +221 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 608a51bf27824ca9856bbdc548a0d810b0e5af65
|
4
|
+
data.tar.gz: 0f4d6fe64878316614c9bccafab5cee6e553bbb8
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: b6d2180e2f2bc109ffca80e3f531297dad50f94f7dfef79ce8bc21ac2010162e09fa2d7d87691031cf00784de4c93d71704822aec102800edce5d36c73609b1c
|
7
|
+
data.tar.gz: 213a4f7215cf3848cef5986a3c727e131f6494c32806b336cbf1c134b89b0330688e6ea4997be8783faa6a652d0a870dac105f30ff55711e5b2edef33e175e33
|
data/.gitignore
ADDED
data/.rspec
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--color
|
data/CHANGELOG.md
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# 1.0.3
|
2
|
+
|
3
|
+
* Implement SetupHive step
|
4
|
+
|
5
|
+
# 1.0.2
|
6
|
+
|
7
|
+
* Add service_role option when launching a Job.
|
8
|
+
* BUG: Fix job_flow_role option to be properly set when passed to Job in CLI launcher.
|
9
|
+
|
10
|
+
# 1.0.1
|
11
|
+
|
12
|
+
* `add_setup_pig_step_method_options` now allows `pig_version` to be configured
|
13
|
+
* `S3DistCp` now allows for the `s3_distcp_jar` to be configured
|
14
|
+
|
15
|
+
# 1.0.0
|
16
|
+
|
17
|
+
* 1 major enhancement
|
18
|
+
* Birthday!
|
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
Copyright (c) 2013 ReturnPath, Inc.
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
|
+
|
data/README.md
ADDED
@@ -0,0 +1,240 @@
|
|
1
|
+
# ReturnPath EMR Tools
|
2
|
+
|
3
|
+
This is a Ruby library for creating & launching jobs on AWS's Elastic MapReduce
|
4
|
+
service. The library provides two basic tools: a set of classes to encapsulate
|
5
|
+
the data structures expected by the EMR client, and a set of Thor helpers to
|
6
|
+
simplify building job launchers.
|
7
|
+
|
8
|
+
## Client Wrapper
|
9
|
+
|
10
|
+
The AWS EMR client is very low level, and basically expects a hash of values.
|
11
|
+
rp-emr provides wrappers for the basic data types and some helpers for building
|
12
|
+
collections. All objects are built using the
|
13
|
+
[assembler](https://github.com/benhamill/assembler) gem, so you can mix values
|
14
|
+
between method-call syntax and builder-block syntax.
|
15
|
+
|
16
|
+
The basic bits look like this:
|
17
|
+
|
18
|
+
```ruby
|
19
|
+
# Executes a script before the cluster starts processing steps
|
20
|
+
bootstrap_action = RP::EMR::BootstrapAction.new(name: 'action name') do |a|
|
21
|
+
a.path = 's3://path_to_script_to_run'
|
22
|
+
a.args = ['--option value', '--other-option value']
|
23
|
+
end
|
24
|
+
|
25
|
+
# Runs a hadoop jar. This is the bare-bones version, you'll probably want to
|
26
|
+
# use one of the classes in lib/rp/emr/step
|
27
|
+
step = RP::EMR::Step.new(name: 'step name') do |s|
|
28
|
+
s.action_on_failure = 'CANCEL_AND_WAIT'
|
29
|
+
s.hadoop_jar_step = {
|
30
|
+
jar: 's3://path_to_jar',
|
31
|
+
args: ['--option value', '--other-option value'],
|
32
|
+
}
|
33
|
+
end
|
34
|
+
|
35
|
+
# Runs a pig script
|
36
|
+
pig_step = RP::EMR::Step::Pig.new(name: 'pig step') do |s|
|
37
|
+
s.script_path = '/local/path/to/pig_script.pig'
|
38
|
+
s.script_bucket = 'bucket_to_upload_script_to'
|
39
|
+
s.args = ['--args_to_append_to_job']
|
40
|
+
s.pig_params = {'PIG_PARAM' => 'value'}
|
41
|
+
s.pig_version = '0.11.1.1'
|
42
|
+
s.action_on_failure = 'CANCEL_AND_WAIT'
|
43
|
+
s.dry_run = false
|
44
|
+
end
|
45
|
+
|
46
|
+
# There are also steps for setting up pig, setting up debugging, using S3DistCP, etc
|
47
|
+
|
48
|
+
# Creates an instance group. As with RP::EMR::Step, you probably shouldn't be
|
49
|
+
# using this directly, just RP::EMR::InstanceGroups instead
|
50
|
+
instance_group = RP::EMR::InstanceGroup.new(name: 'custom instance group') do |ig|
|
51
|
+
ig.instance_role = 'MASTER'
|
52
|
+
ig.instance_type = 'c1.medium'
|
53
|
+
ig.instance_count = 100
|
54
|
+
ig.market = 'SPOT'
|
55
|
+
ig.bid_price = 2.0
|
56
|
+
end
|
57
|
+
|
58
|
+
# Defines the different instances groups to be used. All the options for
|
59
|
+
# RP::EMR::InstanceGroup are supported, along with a default instance type
|
60
|
+
instance_groups = RP::EMR::InstanceGroups.new do |ig|
|
61
|
+
ig.default_instance_type = 'c1.medium'
|
62
|
+
|
63
|
+
ig.master_instance_type = 'c3.xlarge'
|
64
|
+
|
65
|
+
ig.core_instance_count = 5
|
66
|
+
|
67
|
+
ig.task_instance_count = 100
|
68
|
+
ig.task_instance_market = 'SPOT'
|
69
|
+
ig.task_bid_price = 2.0
|
70
|
+
end
|
71
|
+
|
72
|
+
# Top-level instance definition
|
73
|
+
instances = RP::EMR::Instances.new do |i|
|
74
|
+
i.instance_groups = instance_groups
|
75
|
+
i.ec2_key_name = 'my_key_name'
|
76
|
+
i.hadoop_version = '2.0'
|
77
|
+
end
|
78
|
+
|
79
|
+
# Now we can construct the actual job
|
80
|
+
job = RP::EMR::Job.new do |j|
|
81
|
+
j.instances = instances
|
82
|
+
j.steps = [step, pig_step]
|
83
|
+
j.ami_version = :latest
|
84
|
+
j.bootstrap_actions = [bootstrap_action]
|
85
|
+
j.visible_to_all_users = true
|
86
|
+
j.job_flow_role = 'MyIAMRole'
|
87
|
+
j.tags = ['analysis']
|
88
|
+
end
|
89
|
+
|
90
|
+
# Launch the job using the AWS API
|
91
|
+
AWS::EMR.new.job_flows.create('job_name', job.to_hash)
|
92
|
+
```
|
93
|
+
|
94
|
+
|
95
|
+
## Thor Helpers
|
96
|
+
|
97
|
+
The API wrapper is all fine and dandy, but it's still a pain to work with. So
|
98
|
+
there's a set of Thor helpers to make building jobs easier - they define things
|
99
|
+
like defaults, option parsing, and other goodness.
|
100
|
+
|
101
|
+
The gem installs a script called `emr` which provides basic options if you want
|
102
|
+
to build jobs interactively
|
103
|
+
|
104
|
+
```bash
|
105
|
+
bundle exec emr help
|
106
|
+
> Commands:
|
107
|
+
> emr add_pig_script_step JOB_ID SCRIPT_PATH # Add a Pig script step to an existing job
|
108
|
+
> emr add_rollup_step JOB_ID INPUT OUTPUT # Add a S3DistCp rollup step to an existing job
|
109
|
+
> emr add_setup_pig_step JOB_ID # Add a setup pig step to an existing job
|
110
|
+
> emr create_job JOB_NAME # Create an EMR job
|
111
|
+
> emr help [COMMAND] # Describe available commands or one specific command
|
112
|
+
>
|
113
|
+
> Options:
|
114
|
+
> -a, [--keep-alive], [--no-keep-alive] # Set to true if you want the cluster to stay alive after completion/failure
|
115
|
+
> -v, [--verbose], [--no-verbose] # Print lots of stuff
|
116
|
+
> [--dry-run], [--no-dry-run] # Don't actually talk to AWS
|
117
|
+
```
|
118
|
+
|
119
|
+
While these can be useful, the real goal is to make it easy to roll your own
|
120
|
+
CLI using these as building blocks. This is accomplished by providing class-level
|
121
|
+
helpers to import the options used for each step, allowing you to invoke them
|
122
|
+
as modular components.
|
123
|
+
|
124
|
+
For example:
|
125
|
+
|
126
|
+
```ruby
|
127
|
+
#!/usr/bin/env ruby
|
128
|
+
|
129
|
+
require 'rp/emr'
|
130
|
+
require 'thor'
|
131
|
+
|
132
|
+
class ExampleCLI < Thor
|
133
|
+
# This brings all the class-level helpers in
|
134
|
+
extend RP::EMR::CLI::TaskOptions
|
135
|
+
|
136
|
+
# Creates shared options like --dry-run and --verbose
|
137
|
+
cli_class_options
|
138
|
+
|
139
|
+
# We're going to write a CLI for launching a pig script. The first thing
|
140
|
+
# we do is give it a name (this is standard Thor)
|
141
|
+
desc "pig", "Test a pig script"
|
142
|
+
|
143
|
+
# We'll need to launch a cluster to do our computation with. This method adds
|
144
|
+
# the options we'll use to create the cluster. Values passed to the method are
|
145
|
+
# used as the defaults
|
146
|
+
create_job_method_options(
|
147
|
+
default_instance_type: 'm1.large',
|
148
|
+
core_instance_count: 2,
|
149
|
+
task_instance_count: 6,
|
150
|
+
job_flow_role: 'MyIAMRole',
|
151
|
+
)
|
152
|
+
|
153
|
+
# Here we're importing the options used to control how Pig is setup
|
154
|
+
add_setup_pig_step_method_options
|
155
|
+
|
156
|
+
# And here we're importing options used to create a Pig step generally
|
157
|
+
add_pig_script_step_method_options(
|
158
|
+
script_bucket: 'my-emr-scripts-bucket',
|
159
|
+
)
|
160
|
+
|
161
|
+
# Let's define some options specific to the task we're trying to complete
|
162
|
+
method_option :output, default: 'counted_words'
|
163
|
+
def pig
|
164
|
+
script_path = File.expand_path('../count_words.pig', __FILE__)
|
165
|
+
input_path = "s3://my-input-bucket/words"
|
166
|
+
output_path = "s3://my-output-bucket/#{options[:output]}/#{Time.now.to_i}"
|
167
|
+
|
168
|
+
# These will be available in our Pig script as '$INPUT' and '$OUTPUT'
|
169
|
+
pig_step_args = { pig_params: options[:pig_params].merge(
|
170
|
+
'INPUT' => input_path,
|
171
|
+
'OUTPUT' => output_path,
|
172
|
+
)}
|
173
|
+
|
174
|
+
# Now that we've constructed our options, we'll use the Thor task in lib/rp/emr/cli
|
175
|
+
# to create a job flow. The task returns the job identifier, and we're passing
|
176
|
+
# the options hash that Thor parsed for us (this is why we did all that setup
|
177
|
+
# earlier)
|
178
|
+
job_id = invoke 'emr:create_job', ['Word Count Job'], options
|
179
|
+
|
180
|
+
# The job has been created, so we'll add a step to setup pig
|
181
|
+
invoke 'emr:add_setup_pig_step', [job_id], options
|
182
|
+
|
183
|
+
# And finally we'll add our pig script. Notice that we're merging the pig
|
184
|
+
# args into the options hash. We could also have passed these options as CLI
|
185
|
+
# options - this lets us do complicated stuff like date coercions in Ruby
|
186
|
+
invoke 'emr:add_pig_script_step', [job_id, script_path], options.merge(pig_step_args)
|
187
|
+
end
|
188
|
+
end
|
189
|
+
|
190
|
+
ExampleCLI.start
|
191
|
+
```
|
192
|
+
|
193
|
+
Now, we can get a nice help page describing all the options available to us
|
194
|
+
|
195
|
+
```bash
|
196
|
+
bundle exec ./word_count_cli --help
|
197
|
+
> Commands:
|
198
|
+
> word_count_cli help [COMMAND] # Describe available commands or one specific command
|
199
|
+
> word_count_cli pig                         # Test a pig script
|
200
|
+
>
|
201
|
+
> Options:
|
202
|
+
> -a, [--keep-alive], [--no-keep-alive] # Set to true if you want the cluster to stay alive after completion/failure
|
203
|
+
> -v, [--verbose], [--no-verbose] # Print lots of stuff
|
204
|
+
> [--dry-run], [--no-dry-run] # Don't actually talk to AWS
|
205
|
+
|
206
|
+
bundle exec ./word_count_cli help pig
|
207
|
+
> Usage:
|
208
|
+
> word_count_cli pig
|
209
|
+
>
|
210
|
+
> Options:
|
211
|
+
> -k, [--ec2-key-name=KEY_NAME] # An AWS keypair for the cluster. Useful if you want to shell into the cluster
|
212
|
+
> [--default-instance-type=INSTANCE_TYPE] # The EC2 instance type to use for the cluster
|
213
|
+
> # Default: m1.large
|
214
|
+
> [--master-instance-type=INSTANCE_TYPE] # The EC2 instance type to use for the cluster master group
|
215
|
+
> [--master-instance-count=N] # The number of task instances to create in the cluster master group
|
216
|
+
> [--core-instance-type=INSTANCE_TYPE] # The EC2 instance type to use for the cluster core group
|
217
|
+
> [--core-instance-count=N] # The number of task instances to create in the cluster core group
|
218
|
+
> # Default: 2
|
219
|
+
> [--task-instance-type=INSTANCE_TYPE] # The EC2 instance type to use for the cluster task group
|
220
|
+
> [--task-instance-count=N] # The number of task instances to create in the cluster task group
|
221
|
+
> # Default: 6
|
222
|
+
> [--task-bid-price=N.NN] # If set, will use spot instances for task trackers with this bid price
|
223
|
+
> [--job-flow-role=IAM_ROLE] # IAM Role for the job flow
|
224
|
+
> # Default: MyIAMRole
|
225
|
+
> [--script-bucket=BUCKET] # The S3 bucket to use for storing the Pig script
|
226
|
+
> # Default: my-emr-scripts-bucket
|
227
|
+
> -p, [--pig-params=PARAM:VALUE] # Parameters to be passed to the pig script
|
228
|
+
> [--output=OUTPUT]
|
229
|
+
> -a, [--keep-alive], [--no-keep-alive] # Set to true if you want the cluster to stay alive after completion/failure
|
230
|
+
> -v, [--verbose], [--no-verbose] # Print lots of stuff
|
231
|
+
> [--dry-run], [--no-dry-run] # Don't actually talk to AWS
|
232
|
+
|
233
|
+
bundle exec ./word_count_cli pig --output foo --dry-run
|
234
|
+
> -----------
|
235
|
+
> Created job flow job_flow_id with ["Word Count Job"], {"keep_alive"=>false, "verbose"=>false, "dry_run"=>true, ...}
|
236
|
+
> -----------
|
237
|
+
> Added setup pig step to job_flow_id with ["job_flow_id"], {"keep_alive"=>false, "verbose"=>false, "dry_run"=>true, ...}
|
238
|
+
> -----------
|
239
|
+
> Added pig script step to job_flow_id with ["job_flow_id", "count_words.pig"], {"keep_alive"=>false, ...}
|
240
|
+
```
|
data/Rakefile
ADDED
data/bin/emr
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
module RP
  module EMR
    # Bootstrap action wrapper.
    #
    # Wraps the name/path/args triple that EMR expects for a script
    # bootstrap action and serializes it via #to_hash.
    #
    # @example
    #   def bootstrap_hadoop
    #     RP::EMR::BootstrapAction.new(
    #       name: 'Configure Hadoop',
    #       path: 's3://elasticmapreduce/bootstrap-actions/configure-hadoop',
    #       args: ['-c', 'fs.s3n.multipart.uploads.enabled=false']
    #     )
    #   end
    #
    #   def bootstrap_daemons
    #     RP::EMR::BootstrapAction.new(
    #       name: 'Configure Daemons',
    #       path: 's3://elasticmapreduce/bootstrap-actions/configure-daemons',
    #       args: ['--namenode-heap-size=15000'],
    #     )
    #   end
    #
    class BootstrapAction
      extend Assembler

      # :name and :path are required; :args defaults to an empty list.
      assemble_from :name, :path, args: []

      # @return [Hash] the structure the EMR client expects for a
      #   script bootstrap action
      def to_hash
        script = {
          path: path,
          args: args,
        }
        {
          name: name,
          script_bootstrap_action: script,
        }
      end
    end
  end
end
|
data/lib/rp/emr/cli.rb
ADDED
@@ -0,0 +1,249 @@
|
|
1
|
+
module RP
  module EMR
    # Thor-based CLI for creating EMR job flows and adding steps to
    # existing ones.
    #
    # The TaskOptions module holds class-level helpers that declare the
    # Thor options used by each task. User-written CLIs `extend` it so
    # they can reuse the same option sets (see README).
    class CLI < Thor
      module TaskOptions
        # Declares the options shared by every task:
        # --keep-alive / --verbose / --dry-run.
        def cli_class_options
          class_option :keep_alive, aliases: '-a', default: false, type: :boolean, desc: 'Set to true if you want the cluster to stay alive after completion/failure'
          class_option :verbose, aliases: '-v', default: false, type: :boolean, desc: 'Print lots of stuff'
          class_option :dry_run, default: false, type: :boolean, desc: "Don't actually talk to AWS"
        end

        # Declares the options used by create_job. Values passed in
        # +defaults+ become the option defaults.
        def create_job_method_options(defaults = {})
          method_option(:ec2_key_name,
            default: defaults[:ec2_key_name],
            aliases: '-k',
            banner: 'KEY_NAME',
            desc: "An AWS keypair for the cluster. Useful if you want to shell into the cluster",
          )
          method_option(:default_instance_type,
            default: defaults[:default_instance_type],
            banner: 'INSTANCE_TYPE',
            desc: "The EC2 instance type to use for the cluster",
          )
          method_option(:master_instance_type,
            default: defaults[:master_instance_type],
            banner: 'INSTANCE_TYPE',
            desc: "The EC2 instance type to use for the cluster master group",
          )
          method_option(:master_instance_count,
            default: defaults[:master_instance_count],
            type: :numeric,
            banner: 'N',
            # FIX: desc previously said "task instances" (copy-paste from the task group)
            desc: "The number of instances to create in the cluster master group",
          )
          method_option(:core_instance_type,
            default: defaults[:core_instance_type],
            banner: 'INSTANCE_TYPE',
            desc: "The EC2 instance type to use for the cluster core group",
          )
          method_option(:core_instance_count,
            default: defaults[:core_instance_count],
            type: :numeric,
            banner: 'N',
            # FIX: desc previously said "task instances" (copy-paste from the task group)
            desc: "The number of instances to create in the cluster core group",
          )
          method_option(:task_instance_type,
            default: defaults[:task_instance_type],
            banner: 'INSTANCE_TYPE',
            desc: "The EC2 instance type to use for the cluster task group",
          )
          method_option(:task_instance_count,
            default: defaults[:task_instance_count],
            type: :numeric,
            banner: 'N',
            desc: "The number of task instances to create in the cluster task group",
          )
          method_option(:task_bid_price,
            default: defaults[:task_bid_price],
            type: :numeric,
            banner: 'N.NN',
            desc: "If set, will use spot instances for task trackers with this bid price",
          )
          method_option(:job_flow_role,
            default: defaults[:job_flow_role],
            banner: 'IAM_ROLE',
            desc: "IAM Role for the job flow",
          )
          method_option(:service_role,
            default: defaults[:service_role],
            banner: 'IAM_ROLE',
            desc: "IAM Role for the service",
          )
        end

        # Declares the options used by add_setup_pig_step.
        def add_setup_pig_step_method_options(defaults = {})
          method_option(:pig_version,
            default: defaults[:pig_version] || '0.11.1.1',
            desc: 'Version of Pig to install'
          )
        end

        # Declares the options used by add_setup_hive_step.
        def add_setup_hive_step_method_options(defaults = {})
          method_option(:hive_version,
            default: defaults[:hive_version] || 'latest',
            desc: 'Version of Hive to install'
          )
        end

        # Declares the options used by add_rollup_step.
        def add_rollup_step_method_options(defaults = {})
          method_option(:rollup_input_pattern,
            default: defaults[:rollup_input_pattern],
            # FIX: "compatable" -> "compatible" in the help text
            desc: 'Java-compatible regex to filter input',
          )
          method_option(:rollup_group_by,
            default: defaults[:rollup_group_by],
            desc: 'Java-compatible regex with a single capture group',
          )
          method_option(:rollup_target_size,
            default: defaults[:rollup_target_size],
            type: :numeric,
            desc: 'The target file size for rolled up files',
          )
        end

        # Declares the options used by add_pig_script_step.
        def add_pig_script_step_method_options(defaults = {})
          method_option(:script_bucket,
            default: defaults[:script_bucket],
            banner: 'BUCKET',
            desc: 'The S3 bucket to use for storing the Pig script',
          )
          method_option(:pig_params,
            default: defaults[:pig_params] || {},
            aliases: '-p',
            type: :hash,
            banner: 'PARAM:VALUE',
            desc: 'Parameters to be passed to the pig script',
          )
        end
      end

      extend TaskOptions

      namespace :emr

      cli_class_options

      desc "create_job JOB_NAME", "Create an EMR job"
      create_job_method_options
      # Creates a job flow with a SetupDebugging step and returns its id.
      # With --dry-run, no AWS call is made and a stub id is returned.
      def create_job(job_name, *)
        instances = RP::EMR::Instances.new do |i|
          i.hadoop_version = '2.2.0'
          i.ec2_key_name = options[:ec2_key_name] if options[:ec2_key_name]
          i.keep_job_flow_alive_when_no_steps = options[:keep_alive]

          i.instance_groups = RP::EMR::InstanceGroups.new do |ig|
            ig.default_instance_type = options[:default_instance_type] if options[:default_instance_type]

            ig.master_instance_type = options[:master_instance_type] if options[:master_instance_type]
            ig.master_instance_count = options[:master_instance_count] if options[:master_instance_count]

            ig.core_instance_type = options[:core_instance_type] if options[:core_instance_type]
            ig.core_instance_count = options[:core_instance_count] if options[:core_instance_count]

            ig.task_instance_type = options[:task_instance_type] if options[:task_instance_type]
            ig.task_instance_count = options[:task_instance_count] if options[:task_instance_count]
            ig.task_bid_price = options[:task_bid_price] if options[:task_bid_price]
          end.to_a
        end

        setup_debugging_step = RP::EMR::Step::SetupDebugging.new do |s|
          s.action_on_failure = 'CANCEL_AND_WAIT' if options[:keep_alive]
        end

        # NOTE(review): the log bucket/prefix is hard-coded; consider making
        # it configurable. `String#underscore` requires ActiveSupport.
        job = RP::EMR::Job.new do |j|
          j.log_uri = "s3://oib-mapreduce/logs/mosaic_analysis/#{job_name.underscore}"
          j.instances = instances.to_hash
          j.steps = [setup_debugging_step.to_hash]
          j.job_flow_role = options[:job_flow_role] if options[:job_flow_role]
          j.service_role = options[:service_role] if options[:service_role]
        end

        if options[:dry_run]
          job_flow = OpenStruct.new(id: 'job_flow_id')
        else
          job_flow = AWS::EMR.new.job_flows.create(job_name, job.to_hash)
        end
        puts '-----------'
        puts "Created job flow #{job_flow.id} with #{args}, #{options}"
        pp job.to_hash if options[:verbose]

        return job_flow.id
      end

      desc "add_setup_pig_step JOB_ID", "Add a setup pig step to an existing job"
      add_setup_pig_step_method_options
      # Adds a SetupPig step to the job flow identified by +job_id+.
      def add_setup_pig_step(job_id, *)
        job = AWS::EMR.new.job_flows[job_id]

        step = RP::EMR::Step::SetupPig.new do |s|
          s.action_on_failure = 'CANCEL_AND_WAIT' if options[:keep_alive]
          s.pig_version = options[:pig_version] if options[:pig_version]
        end

        job.add_steps([step.to_hash]) unless options[:dry_run]
        puts '-----------'
        puts "Added setup pig step to #{job.id} with #{args}, #{options}"
        pp step.to_hash if options[:verbose]
      end

      desc "add_setup_hive_step JOB_ID", "Add a setup hive step to an existing job"
      add_setup_hive_step_method_options
      # Adds a SetupHive step to the job flow identified by +job_id+.
      def add_setup_hive_step(job_id, *)
        job = AWS::EMR.new.job_flows[job_id]

        step = RP::EMR::Step::SetupHive.new do |s|
          s.action_on_failure = 'CANCEL_AND_WAIT' if options[:keep_alive]
          s.hive_version = options[:hive_version] if options[:hive_version]
        end

        job.add_steps([step.to_hash]) unless options[:dry_run]
        puts '-----------'
        puts "Added setup hive step to #{job.id} with #{args}, #{options}"
        pp step.to_hash if options[:verbose]
      end

      desc "add_rollup_step JOB_ID INPUT OUTPUT", "Add a S3DistCp rollup step to an existing job"
      add_rollup_step_method_options
      # Adds an S3DistCp step that rolls small files in +input+ up into
      # larger files written to +output+.
      def add_rollup_step(job_id, input, output, *)
        job = AWS::EMR.new.job_flows[job_id]

        step = RP::EMR::Step::S3DistCp.new(
          name: 'Rollup',
          src: input,
          dest: output,
        ) do |s|
          # camelCase attribute names mirror the s3distcp tool's own flags
          s.srcPattern = options[:rollup_input_pattern] if options[:rollup_input_pattern]
          s.groupBy = options[:rollup_group_by] if options[:rollup_group_by]
          s.targetSize = options[:rollup_target_size] if options[:rollup_target_size]
          s.action_on_failure = 'CANCEL_AND_WAIT' if options[:keep_alive]
        end

        job.add_steps([step.to_hash]) unless options[:dry_run]
        puts '-----------'
        puts "Added rollup step to #{job.id} with #{args}, #{options}"
        pp step.to_hash if options[:verbose]
      end

      desc "add_pig_script_step JOB_ID SCRIPT_PATH", "Add a Pig script step to an existing job"
      add_pig_script_step_method_options
      # Uploads the local script at +script_path+ and adds a Pig step
      # running it to the job flow identified by +job_id+.
      def add_pig_script_step(job_id, script_path, *)
        job = AWS::EMR.new.job_flows[job_id]

        step = RP::EMR::Step::Pig.new(
          name: 'Pig',
          script_path: script_path,
          script_bucket: options[:script_bucket],
        ) do |s|
          s.pig_params = options[:pig_params] if options[:pig_params]
          s.action_on_failure = 'CANCEL_AND_WAIT' if options[:keep_alive]
          s.dry_run = options[:dry_run]
        end

        job.add_steps([step.to_hash]) unless options[:dry_run]
        puts '-----------'
        puts "Added pig script step to #{job.id} with #{args}, #{options}"
        pp step.to_hash if options[:verbose]
      end
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
module RP
  module EMR
    # Wraps a single EMR instance group definition (role, type, count,
    # plus optional name/market/bid price) and serializes it via #to_hash.
    class InstanceGroup
      extend Assembler

      assemble_from(
        # Required params
        :instance_role,
        :instance_type,
        :instance_count,

        # Optional params
        name: nil,
        market: nil,
        bid_price: nil,
      )

      # @return [Hash] the instance group structure expected by the EMR
      #   client; nil and empty values are pruned so optional fields are
      #   simply omitted
      def to_hash
        attrs = {
          name: name,
          market: market,
          instance_role: instance_role,
          bid_price: bid_price.to_s,
          instance_type: instance_type,
          instance_count: instance_count,
        }
        attrs.reject { |_key, value| !value || (value.respond_to?(:empty?) && value.empty?) }
      end

      private

      # Setting a bid price implies the spot market; otherwise fall back
      # to whatever market was assembled (note: reads the ivar directly).
      def market
        bid_price ? 'SPOT' : @market
      end
    end
  end
end
|