rp-emr 1.0.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 608a51bf27824ca9856bbdc548a0d810b0e5af65
4
+ data.tar.gz: 0f4d6fe64878316614c9bccafab5cee6e553bbb8
5
+ SHA512:
6
+ metadata.gz: b6d2180e2f2bc109ffca80e3f531297dad50f94f7dfef79ce8bc21ac2010162e09fa2d7d87691031cf00784de4c93d71704822aec102800edce5d36c73609b1c
7
+ data.tar.gz: 213a4f7215cf3848cef5986a3c727e131f6494c32806b336cbf1c134b89b0330688e6ea4997be8783faa6a652d0a870dac105f30ff55711e5b2edef33e175e33
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ .bundle
2
+ .idea
3
+ pkg/
4
+ Gemfile.lock
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --color
data/CHANGELOG.md ADDED
@@ -0,0 +1,18 @@
1
+ # 1.0.3
2
+
3
+ * Implement SetupHive step
4
+
5
+ # 1.0.2
6
+
7
+ * Add service_role option when launching a Job.
8
+ * BUG: Fix job_flow_role option to be properly set when passed to Job in CLI launcher.
9
+
10
+ # 1.0.1
11
+
12
+ * `add_setup_pig_step_method_options` now allows `pig_version` to be configured
13
+ * `S3DistCp` now allows for the `s3_distcp_jar` to be configured
14
+
15
+ # 1.0.0
16
+
17
+ * 1 major enhancement
18
+ * Birthday!
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,23 @@
1
+ Copyright (c) 2013 ReturnPath, Inc.
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+
data/README.md ADDED
@@ -0,0 +1,240 @@
1
+ # ReturnPath EMR Tools
2
+
3
+ This is a Ruby library for creating & launching jobs on AWS's Elastic MapReduce
4
+ service. The library provides two basic tools: a set of classes to encapsulate
5
+ the data structures expected by the EMR client, and a set of Thor helpers to
6
+ simplify building job launchers.
7
+
8
+ ## Client Wrapper
9
+
10
+ The AWS EMR client is very low level, and basically expects a hash of values.
11
+ rp-emr provides wrappers for the basic data types and some helpers for building
12
+ collections. All objects are built using the
13
+ [assembler](https://github.com/benhamill/assembler) gem, so you can mix values
14
+ between method-call syntax and builder-block syntax.
15
+
16
+ The basic bits look like this:
17
+
18
+ ```ruby
19
+ # Executes a script before the cluster starts processing steps
20
+ bootstrap_action = RP::EMR::BootstrapAction.new(name: 'action name') do |a|
21
+ a.path = 's3://path_to_script_to_run'
22
+ a.args = ['--option value', '--other-option value']
23
+ end
24
+
25
+ # Runs a hadoop jar. This is the bare-bones version, you'll probably want to
26
+ # use one of the classes in lib/rp/emr/step
27
+ step = RP::EMR::Step.new(name: 'step name') do |s|
28
+ s.action_on_failure = 'CANCEL_AND_WAIT'
29
+ s.hadoop_jar_step = {
30
+ jar: 's3://path_to_jar',
31
+ args: ['--option value', '--other-option value'],
32
+ }
33
+ end
34
+
35
+ # Runs a pig script
36
+ pig_step = RP::EMR::Step::Pig.new(name: 'pig step') do |s|
37
+ s.script_path = '/local/path/to/pig_script.pig'
38
+ s.script_bucket = 'bucket_to_upload_script_to'
39
+ s.args = ['--args_to_append_to_job']
40
+ s.pig_params = {'PIG_PARAM' => 'value'}
41
+ s.pig_version = '0.11.1.1'
42
+ s.action_on_failure = 'CANCEL_AND_WAIT'
43
+ s.dry_run = false
44
+ end
45
+
46
+ # There are also steps for setting up pig, setting up debugging, using S3DistCP, etc
47
+
48
+ # Creates an instance group. As with RP::EMR::Step, you probably shouldn't be
49
+ # using this directly, just RP::EMR::InstanceGroups instead
50
+ instance_group = RP::EMR::InstanceGroup.new(name: 'custom instance group') do |ig|
51
+ ig.instance_role = 'MASTER'
52
+ ig.instance_type = 'c1.medium'
53
+ ig.instance_count = 100
54
+ ig.market = 'SPOT'
55
+ ig.bid_price = 2.0
56
+ end
57
+
58
+ # Defines the different instances groups to be used. All the options for
59
+ # RP::EMR::InstanceGroup are supported, along with a default instance type
60
+ instance_groups = RP::EMR::InstanceGroups.new do |ig|
61
+ ig.default_instance_type = 'c1.medium'
62
+
63
+ ig.master_instance_type = 'c3.xlarge'
64
+
65
+ ig.core_instance_count = 5
66
+
67
+ ig.task_instance_count = 100
68
+ ig.task_instance_market = 'SPOT'
69
+ ig.task_bid_price = 2.0
70
+ end
71
+
72
+ # Top-level instance definition
73
+ instances = RP::EMR::Instances.new do |i|
74
+ i.instance_groups = instance_groups
75
+ i.ec2_key_name = 'my_key_name'
76
+ i.hadoop_version = '2.0'
77
+ end
78
+
79
+ # Now we can construct the actual job
80
+ job = RP::EMR::Job.new do |j|
81
+ j.instances = instances
82
+ j.steps = [step, pig_step]
83
+ j.ami_version = :latest
84
+ j.bootstrap_actions = [bootstrap_action]
85
+ j.visible_to_all_users = true
86
+ j.job_flow_role = 'MyIAMRole'
87
+ j.tags = ['analysis']
88
+ end
89
+
90
+ # Launch the job using the AWS API
91
+ AWS::EMR.new.job_flows.create('job_name', job.to_hash)
92
+ ```
93
+
94
+
95
+ ## Thor Helpers
96
+
97
+ The API wrapper is all fine and dandy, but it's still a pain to work with. So
98
+ there's a set of Thor helpers to make building jobs easier - they define things
99
+ like defaults, option parsing, and other goodness.
100
+
101
+ The gem installs a script called `emr` which provides basic options if you want
102
+ to build jobs interactively
103
+
104
+ ```bash
105
+ bundle exec emr help
106
+ > Commands:
107
+ > emr add_pig_script_step JOB_ID SCRIPT_PATH # Add a Pig script step to an existing job
108
+ > emr add_rollup_step JOB_ID INPUT OUTPUT # Add a S3DistCp rollup step to an existing job
109
+ > emr add_setup_pig_step JOB_ID # Add a setup pig step to an existing job
110
+ > emr create_job JOB_NAME # Create an EMR job
111
+ > emr help [COMMAND] # Describe available commands or one specific command
112
+ >
113
+ > Options:
114
+ > -a, [--keep-alive], [--no-keep-alive] # Set to true if you want the cluster to stay alive after completion/failure
115
+ > -v, [--verbose], [--no-verbose] # Print lots of stuff
116
+ > [--dry-run], [--no-dry-run] # Don't actually talk to AWS
117
+ ```
118
+
119
+ While these can be useful, the real goal is to make it easy to roll your own
120
+ CLI using these as building blocks. This is accomplished by providing class-level
121
+ helpers to import the options used for each step, allowing you to invoke them
122
+ as modular components.
123
+
124
+ For example:
125
+
126
+ ```ruby
127
+ #!/usr/bin/env ruby
128
+
129
+ require 'rp/emr'
130
+ require 'thor'
131
+
132
+ class ExampleCLI < Thor
133
+ # This brings all the class-level helpers in
134
+ extend RP::EMR::CLI::TaskOptions
135
+
136
+ # Creates shared options like --dry-run and --verbose
137
+ cli_class_options
138
+
139
+ # We're going to write a CLI for launching a pig script. The first thing
140
+ # we do is give it a name (this is standard Thor)
141
+ desc "pig", "Test a pig script"
142
+
143
+ # We'll need to launch a cluster to do our computation with. This method adds
144
+ # the options we'll use to create the cluster. Values passed to the method are
145
+ # used as the defaults
146
+ create_job_method_options(
147
+ default_instance_type: 'm1.large',
148
+ core_instance_count: 2,
149
+ task_instance_count: 6,
150
+ job_flow_role: 'MyIAMRole',
151
+ )
152
+
153
+ # Here we're importing the options used to control how Pig is setup
154
+ add_setup_pig_step_method_options
155
+
156
+ # And here we're importing options used to create a Pig step generally
157
+ add_pig_script_step_method_options(
158
+ script_bucket: 'my-emr-scripts-bucket',
159
+ )
160
+
161
+ # Let's define some options specific to the task we're trying to complete
162
+ method_option :output, default: 'counted_words'
163
+ def pig
164
+ script_path = File.expand_path('../count_words.pig', __FILE__)
165
+ input_path = "s3://my-input-bucket/words"
166
+ output_path = "s3://my-output-bucket/#{options[:output]}/#{Time.now.to_i}"
167
+
168
+ # These will be available in our Pig script as '$INPUT' and '$OUTPUT'
169
+ pig_step_args = { pig_params: options[:pig_params].merge(
170
+ 'INPUT' => input_path,
171
+ 'OUTPUT' => output_path,
172
+ )}
173
+
174
+ # Now that we've constructed our options, we'll use the Thor task in lib/rp/emr/cli
175
+ # to create a job flow. The task returns the job identifier, and we're passing
176
+ # the options hash that Thor parsed for us (this is why we did all that setup
177
+ # earlier)
178
+ job_id = invoke 'emr:create_job', ['Word Count Job'], options
179
+
180
+ # The job has been created, so we'll add a step to setup pig
181
+ invoke 'emr:add_setup_pig_step', [job_id], options
182
+
183
+ # And finally we'll add our pig script. Notice that we're merging the pig
184
+ # args into the options hash. We could also have passed these options as CLI
185
+ # options - this lets us do complicated stuff like date coercions in Ruby
186
+ invoke 'emr:add_pig_script_step', [job_id, script_path], options.merge(pig_step_args)
187
+ end
188
+ end
189
+
190
+ ExampleCLI.start
191
+ ```
192
+
193
+ Now, we can get a nice help page describing all the options available to us
194
+
195
+ ```bash
196
+ bundle exec ./word_count_cli --help
197
+ > Commands:
198
+ > word_count_cli help [COMMAND] # Describe available commands or one specific command
199
+ > word_count_cli pig # Test a pig script
200
+ >
201
+ > Options:
202
+ > -a, [--keep-alive], [--no-keep-alive] # Set to true if you want the cluster to stay alive after completion/failure
203
+ > -v, [--verbose], [--no-verbose] # Print lots of stuff
204
+ > [--dry-run], [--no-dry-run] # Don't actually talk to AWS
205
+
206
+ bundle exec ./word_count_cli help pig
207
+ > Usage:
208
+ > word_count_cli pig
209
+ >
210
+ > Options:
211
+ > -k, [--ec2-key-name=KEY_NAME] # An AWS keypair for the cluster. Useful if you want to shell into the cluster
212
+ > [--default-instance-type=INSTANCE_TYPE] # The EC2 instance type to use for the cluster
213
+ > # Default: m1.large
214
+ > [--master-instance-type=INSTANCE_TYPE] # The EC2 instance type to use for the cluster master group
215
+ > [--master-instance-count=N] # The number of task instances to create in the cluster master group
216
+ > [--core-instance-type=INSTANCE_TYPE] # The EC2 instance type to use for the cluster core group
217
+ > [--core-instance-count=N] # The number of task instances to create in the cluster core group
218
+ > # Default: 2
219
+ > [--task-instance-type=INSTANCE_TYPE] # The EC2 instance type to use for the cluster task group
220
+ > [--task-instance-count=N] # The number of task instances to create in the cluster task group
221
+ > # Default: 6
222
+ > [--task-bid-price=N.NN] # If set, will use spot instances for task trackers with this bid price
223
+ > [--job-flow-role=IAM_ROLE] # IAM Role for the job flow
224
+ > # Default: MyIAMRole
225
+ > [--script-bucket=BUCKET] # The S3 bucket to use for storing the Pig script
226
+ > # Default: my-emr-scripts-bucket
227
+ > -p, [--pig-params=PARAM:VALUE] # Parameters to be passed to the pig script
228
+ > [--output=OUTPUT]
229
+ > -a, [--keep-alive], [--no-keep-alive] # Set to true if you want the cluster to stay alive after completion/failure
230
+ > -v, [--verbose], [--no-verbose] # Print lots of stuff
231
+ > [--dry-run], [--no-dry-run] # Don't actually talk to AWS
232
+
233
+ bundle exec ./word_count_cli pig --output foo --dry-run
234
+ > -----------
235
+ > Created job flow job_flow_id with ["Word Count Job"], {"keep_alive"=>false, "verbose"=>false, "dry_run"=>true, ...}
236
+ > -----------
237
+ > Added setup pig step to job_flow_id with ["job_flow_id"], {"keep_alive"=>false, "verbose"=>false, "dry_run"=>true, ...}
238
+ > -----------
239
+ > Added pig script step to job_flow_id with ["job_flow_id", "count_words.pig"], {"keep_alive"=>false, ...}
240
+ ```
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require 'bundler/gem_tasks'
2
+
3
+ require 'rp/emr'
4
+
5
+ RSpec::Core::RakeTask.new(:spec)
6
+ task :default => :spec
data/bin/emr ADDED
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require File.expand_path('../../lib/rp/emr', __FILE__)
4
+
5
+ RP::EMR::CLI.start
@@ -0,0 +1,38 @@
1
+ module RP
2
+ module EMR
3
+ # Bootstrap action wrapper
4
+ #
5
+ # @example
6
+ # def bootstrap_hadoop
7
+ # RP::EMR::BootstrapAction.new(
8
+ # name: 'Configure Hadoop',
9
+ # path: 's3://elasticmapreduce/bootstrap-actions/configure-hadoop',
10
+ # args: ['-c', 'fs.s3n.multipart.uploads.enabled=false']
11
+ # )
12
+ # end
13
+ #
14
+ # def bootstrap_daemons
15
+ # RP::EMR::BootstrapAction.new(
16
+ # name: 'Configure Daemons',
17
+ # path: 's3://elasticmapreduce/bootstrap-actions/configure-daemons',
18
+ # args: ['--namenode-heap-size=15000'],
19
+ # )
20
+ # end
21
+ #
22
+ class BootstrapAction
23
+ extend Assembler
24
+
25
+ assemble_from :name, :path, args: []
26
+
27
+ def to_hash
28
+ {
29
+ name: name,
30
+ script_bootstrap_action: {
31
+ path: path,
32
+ args: args,
33
+ },
34
+ }
35
+ end
36
+ end
37
+ end
38
+ end
data/lib/rp/emr/cli.rb ADDED
@@ -0,0 +1,249 @@
1
+ module RP
2
+ module EMR
3
+ class CLI < Thor
4
+ module TaskOptions
5
+ def cli_class_options
6
+ class_option :keep_alive, aliases: '-a', default: false, type: :boolean, desc: 'Set to true if you want the cluster to stay alive after completion/failure'
7
+ class_option :verbose, aliases: '-v', default: false, type: :boolean, desc: 'Print lots of stuff'
8
+ class_option :dry_run, default: false, type: :boolean, desc: "Don't actually talk to AWS"
9
+ end
10
+
11
+ def create_job_method_options(defaults = {})
12
+ method_option(:ec2_key_name,
13
+ default: defaults[:ec2_key_name],
14
+ aliases: '-k',
15
+ banner: 'KEY_NAME',
16
+ desc: "An AWS keypair for the cluster. Useful if you want to shell into the cluster",
17
+ )
18
+ method_option(:default_instance_type,
19
+ default: defaults[:default_instance_type],
20
+ banner: 'INSTANCE_TYPE',
21
+ desc: "The EC2 instance type to use for the cluster",
22
+ )
23
+ method_option(:master_instance_type,
24
+ default: defaults[:master_instance_type],
25
+ banner: 'INSTANCE_TYPE',
26
+ desc: "The EC2 instance type to use for the cluster master group",
27
+ )
28
+ method_option(:master_instance_count,
29
+ default: defaults[:master_instance_count],
30
+ type: :numeric,
31
+ banner: 'N',
32
+ desc: "The number of task instances to create in the cluster master group",
33
+ )
34
+ method_option(:core_instance_type,
35
+ default: defaults[:core_instance_type],
36
+ banner: 'INSTANCE_TYPE',
37
+ desc: "The EC2 instance type to use for the cluster core group",
38
+ )
39
+ method_option(:core_instance_count,
40
+ default: defaults[:core_instance_count],
41
+ type: :numeric,
42
+ banner: 'N',
43
+ desc: "The number of task instances to create in the cluster core group",
44
+ )
45
+ method_option(:task_instance_type,
46
+ default: defaults[:task_instance_type],
47
+ banner: 'INSTANCE_TYPE',
48
+ desc: "The EC2 instance type to use for the cluster task group",
49
+ )
50
+ method_option(:task_instance_count,
51
+ default: defaults[:task_instance_count],
52
+ type: :numeric,
53
+ banner: 'N',
54
+ desc: "The number of task instances to create in the cluster task group",
55
+ )
56
+ method_option(:task_bid_price,
57
+ default: defaults[:task_bid_price],
58
+ type: :numeric,
59
+ banner: 'N.NN',
60
+ desc: "If set, will use spot instances for task trackers with this bid price",
61
+ )
62
+ method_option(:job_flow_role,
63
+ default: defaults[:job_flow_role],
64
+ banner: 'IAM_ROLE',
65
+ desc: "IAM Role for the job flow",
66
+ )
67
+ method_option(:service_role,
68
+ default: defaults[:service_role],
69
+ banner: 'IAM_ROLE',
70
+ desc: "IAM Role for the service",
71
+ )
72
+ end
73
+
74
+ def add_setup_pig_step_method_options(defaults = {})
75
+ method_option(:pig_version,
76
+ default: defaults[:pig_version] || '0.11.1.1',
77
+ desc: 'Version of Pig to install'
78
+ )
79
+ end
80
+
81
+ def add_setup_hive_step_method_options(defaults = {})
82
+ method_option(:hive_version,
83
+ default: defaults[:hive_version] || 'latest',
84
+ desc: 'Version of Hive to install'
85
+ )
86
+ end
87
+
88
+ def add_rollup_step_method_options(defaults = {})
89
+ method_option(:rollup_input_pattern,
90
+ default: defaults[:rollup_input_pattern],
91
+ desc: 'Java-compatable regex to filter input',
92
+ )
93
+ method_option(:rollup_group_by,
94
+ default: defaults[:rollup_group_by],
95
+ desc: 'Java-compatable regex with a single capture group',
96
+ )
97
+ method_option(:rollup_target_size,
98
+ default: defaults[:rollup_target_size],
99
+ type: :numeric,
100
+ desc: 'The target file size for rolled up files',
101
+ )
102
+ end
103
+
104
+ def add_pig_script_step_method_options(defaults = {})
105
+ method_option(:script_bucket,
106
+ default: defaults[:script_bucket],
107
+ banner: 'BUCKET',
108
+ desc: 'The S3 bucket to use for storing the Pig script',
109
+ )
110
+ method_option(:pig_params,
111
+ default: defaults[:pig_params] || {},
112
+ aliases: '-p',
113
+ type: :hash,
114
+ banner: 'PARAM:VALUE',
115
+ desc: 'Parameters to be passed to the pig script',
116
+ )
117
+ end
118
+ end
119
+
120
+ extend TaskOptions
121
+
122
+ namespace :emr
123
+
124
+ cli_class_options
125
+
126
+ desc "create_job JOB_NAME", "Create an EMR job"
127
+ create_job_method_options
128
+ def create_job(job_name, *)
129
+ instances = RP::EMR::Instances.new do |i|
130
+ i.hadoop_version = '2.2.0'
131
+ i.ec2_key_name = options[:ec2_key_name] if options[:ec2_key_name]
132
+ i.keep_job_flow_alive_when_no_steps = options[:keep_alive]
133
+
134
+ i.instance_groups = RP::EMR::InstanceGroups.new do |ig|
135
+ ig.default_instance_type = options[:default_instance_type] if options[:default_instance_type]
136
+
137
+ ig.master_instance_type = options[:master_instance_type] if options[:master_instance_type]
138
+ ig.master_instance_count = options[:master_instance_count] if options[:master_instance_count]
139
+
140
+ ig.core_instance_type = options[:core_instance_type] if options[:core_instance_type]
141
+ ig.core_instance_count = options[:core_instance_count] if options[:core_instance_count]
142
+
143
+ ig.task_instance_type = options[:task_instance_type] if options[:task_instance_type]
144
+ ig.task_instance_count = options[:task_instance_count] if options[:task_instance_count]
145
+ ig.task_bid_price = options[:task_bid_price] if options[:task_bid_price]
146
+ end.to_a
147
+ end
148
+
149
+ setup_debugging_step = RP::EMR::Step::SetupDebugging.new do |s|
150
+ s.action_on_failure = 'CANCEL_AND_WAIT' if options[:keep_alive]
151
+ end
152
+
153
+ job = RP::EMR::Job.new do |job|
154
+ job.log_uri = "s3://oib-mapreduce/logs/mosaic_analysis/#{job_name.underscore}"
155
+ job.instances = instances.to_hash
156
+ job.steps = [setup_debugging_step.to_hash]
157
+ job.job_flow_role = options[:job_flow_role] if options[:job_flow_role]
158
+ job.service_role = options[:service_role] if options[:service_role]
159
+ end
160
+
161
+ if options[:dry_run]
162
+ job_flow = OpenStruct.new(id: 'job_flow_id')
163
+ else
164
+ job_flow = AWS::EMR.new.job_flows.create(job_name, job.to_hash)
165
+ end
166
+ puts '-----------'
167
+ puts "Created job flow #{job_flow.id} with #{args}, #{options}"
168
+ pp job.to_hash if options[:verbose]
169
+
170
+ return job_flow.id
171
+ end
172
+
173
+ desc "add_setup_pig_step JOB_ID", "Add a setup pig step to an existing job"
174
+ add_setup_pig_step_method_options
175
+ def add_setup_pig_step(job_id, *)
176
+ job = AWS::EMR.new.job_flows[job_id]
177
+
178
+ step = RP::EMR::Step::SetupPig.new do |s|
179
+ s.action_on_failure = 'CANCEL_AND_WAIT' if options[:keep_alive]
180
+ s.pig_version = options[:pig_version] if options[:pig_version]
181
+ end
182
+
183
+ job.add_steps([step.to_hash]) unless options[:dry_run]
184
+ puts '-----------'
185
+ puts "Added setup pig step to #{job.id} with #{args}, #{options}"
186
+ pp step.to_hash if options[:verbose]
187
+ end
188
+
189
+ desc "add_setup_hive_step JOB_ID", "Add a setup hive step to an existing job"
190
+ add_setup_hive_step_method_options
191
+ def add_setup_hive_step(job_id, *)
192
+ job = AWS::EMR.new.job_flows[job_id]
193
+
194
+ step = RP::EMR::Step::SetupHive.new do |s|
195
+ s.action_on_failure = 'CANCEL_AND_WAIT' if options[:keep_alive]
196
+ s.hive_version = options[:hive_version] if options[:hive_version]
197
+ end
198
+
199
+ job.add_steps([step.to_hash]) unless options[:dry_run]
200
+ puts '-----------'
201
+ puts "Added setup hive step to #{job.id} with #{args}, #{options}"
202
+ pp step.to_hash if options[:verbose]
203
+ end
204
+
205
+ desc "add_rollup_step JOB_ID INPUT OUTPUT", "Add a S3DistCp rollup step to an existing job"
206
+ add_rollup_step_method_options
207
+ def add_rollup_step(job_id, input, output, *)
208
+ job = AWS::EMR.new.job_flows[job_id]
209
+
210
+ step = RP::EMR::Step::S3DistCp.new(
211
+ name: 'Rollup',
212
+ src: input,
213
+ dest: output,
214
+ ) do |s|
215
+ s.srcPattern = options[:rollup_input_pattern] if options[:rollup_input_pattern]
216
+ s.groupBy = options[:rollup_group_by] if options[:rollup_group_by]
217
+ s.targetSize = options[:rollup_target_size] if options[:rollup_target_size]
218
+ s.action_on_failure = 'CANCEL_AND_WAIT' if options[:keep_alive]
219
+ end
220
+
221
+ job.add_steps([step.to_hash]) unless options[:dry_run]
222
+ puts '-----------'
223
+ puts "Added rollup step to #{job.id} with #{args}, #{options}"
224
+ pp step.to_hash if options[:verbose]
225
+ end
226
+
227
+ desc "add_pig_script_step JOB_ID SCRIPT_PATH", "Add a Pig script step to an existing job"
228
+ add_pig_script_step_method_options
229
+ def add_pig_script_step(job_id, script_path, *)
230
+ job = AWS::EMR.new.job_flows[job_id]
231
+
232
+ step = RP::EMR::Step::Pig.new(
233
+ name: 'Pig',
234
+ script_path: script_path,
235
+ script_bucket: options[:script_bucket],
236
+ ) do |s|
237
+ s.pig_params = options[:pig_params] if options[:pig_params]
238
+ s.action_on_failure = 'CANCEL_AND_WAIT' if options[:keep_alive]
239
+ s.dry_run = options[:dry_run]
240
+ end
241
+
242
+ job.add_steps([step.to_hash]) unless options[:dry_run]
243
+ puts '-----------'
244
+ puts "Added pig script step to #{job.id} with #{args}, #{options}"
245
+ pp step.to_hash if options[:verbose]
246
+ end
247
+ end
248
+ end
249
+ end
@@ -0,0 +1,36 @@
1
+ module RP
2
+ module EMR
3
+ class InstanceGroup
4
+ extend Assembler
5
+
6
+ assemble_from(
7
+ # Required params
8
+ :instance_role,
9
+ :instance_type,
10
+ :instance_count,
11
+
12
+ # Optional params
13
+ name: nil,
14
+ market: nil,
15
+ bid_price: nil,
16
+ )
17
+
18
+ def to_hash
19
+ {
20
+ name: name,
21
+ market: market,
22
+ instance_role: instance_role,
23
+ bid_price: bid_price.to_s,
24
+ instance_type: instance_type,
25
+ instance_count: instance_count,
26
+ }.reject { |k,v| !v || (v.respond_to?(:empty?) && v.empty?) }
27
+ end
28
+
29
+ private
30
+
31
+ def market
32
+ bid_price ? 'SPOT' : @market
33
+ end
34
+ end
35
+ end
36
+ end