rp-emr 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 608a51bf27824ca9856bbdc548a0d810b0e5af65
4
+ data.tar.gz: 0f4d6fe64878316614c9bccafab5cee6e553bbb8
5
+ SHA512:
6
+ metadata.gz: b6d2180e2f2bc109ffca80e3f531297dad50f94f7dfef79ce8bc21ac2010162e09fa2d7d87691031cf00784de4c93d71704822aec102800edce5d36c73609b1c
7
+ data.tar.gz: 213a4f7215cf3848cef5986a3c727e131f6494c32806b336cbf1c134b89b0330688e6ea4997be8783faa6a652d0a870dac105f30ff55711e5b2edef33e175e33
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ .bundle
2
+ .idea
3
+ pkg/
4
+ Gemfile.lock
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --color
data/CHANGELOG.md ADDED
@@ -0,0 +1,18 @@
1
+ # 1.0.3
2
+
3
+ * Implement SetupHive step
4
+
5
+ # 1.0.2
6
+
7
+ * Add service_role option when launching a Job.
8
+ * BUG: Fix job_flow_role option to be properly set when passed to Job in CLI launcher.
9
+
10
+ # 1.0.1
11
+
12
+ * `add_setup_pig_step_method_options` now allows `pig_version` to be configured
13
+ * `S3DistCp` now allows for the `s3_distcp_jar` to be configured
14
+
15
+ # 1.0.0
16
+
17
+ * 1 major enhancement
18
+ * Birthday!
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,23 @@
1
+ Copyright (c) 2013 ReturnPath, Inc.
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+
data/README.md ADDED
@@ -0,0 +1,240 @@
1
+ # ReturnPath EMR Tools
2
+
3
+ This is a Ruby library for creating & launching jobs on AWS's Elastic MapReduce
4
+ service. The library provides two basic tools: a set of classes to encapsulate
5
+ the data structures expected by the EMR client, and a set of Thor helpers to
6
+ simplify building job launchers.
7
+
8
+ ## Client Wrapper
9
+
10
+ The AWS EMR client is very low level, and basically expects a hash of values.
11
+ rp-emr provides wrappers for the basic data types and some helpers for building
12
+ collections. All objects are built using the
13
+ [assembler](https://github.com/benhamill/assembler) gem, so you can mix values
14
+ between method-call syntax and builder-block syntax.
15
+
16
+ The basic bits look like this:
17
+
18
+ ```ruby
19
+ # Executes a script before the cluster starts processing steps
20
+ bootstrap_action = RP::EMR::BootstrapAction.new(name: 'action name') do |a|
21
+ a.path = 's3://path_to_script_to_run'
22
+ a.args = ['--option value', '--other-option value']
23
+ end
24
+
25
+ # Runs a hadoop jar. This is the bare-bones version, you'll probably want to
26
+ # use one of the classes in lib/rp/emr/step
27
+ step = RP::EMR::Step.new(name: 'step name') do |s|
28
+ s.action_on_failure = 'CANCEL_AND_WAIT'
29
+ s.hadoop_jar_step = {
30
+ jar: 's3://path_to_jar',
31
+ args: ['--option value', '--other-option value'],
32
+ }
33
+ end
34
+
35
+ # Runs a pig script
36
+ pig_step = RP::EMR::Step::Pig.new(name: 'pig step') do |s|
37
+ s.script_path = '/local/path/to/pig_script.pig'
38
+ s.script_bucket = 'bucket_to_upload_script_to'
39
+ s.args = ['--args_to_append_to_job']
40
+ s.pig_params = {'PIG_PARAM' => 'value'}
41
+ s.pig_version = '0.11.1.1'
42
+ s.action_on_failure = 'CANCEL_AND_WAIT'
43
+ s.dry_run = false
44
+ end
45
+
46
+ # There are also steps for setting up pig, setting up debugging, using S3DistCP, etc
47
+
48
+ # Creates an instance group. As with RP::EMR::Step, you probably shouldn't be
49
+ # using this directly, just RP::EMR::InstanceGroups instead
50
+ instance_group = RP::EMR::InstanceGroup.new(name: 'custom instance group') do |ig|
51
+ ig.instance_role = 'MASTER'
52
+ ig.instance_type = 'c1.medium'
53
+ ig.instance_count = 100
54
+ ig.market = 'SPOT'
55
+ ig.bid_price = 2.0
56
+ end
57
+
58
+ # Defines the different instances groups to be used. All the options for
59
+ RP::EMR::InstanceGroup are supported, along with a default instance type
60
+ instance_groups = RP::EMR::InstanceGroups.new do |ig|
61
+ ig.default_instance_type = 'c1.medium'
62
+
63
+ ig.master_instance_type = 'c3.xlarge'
64
+
65
+ ig.core_instance_count = 5
66
+
67
+ ig.task_instance_count = 100
68
+ ig.task_instance_market = 'SPOT'
69
+ ig.task_bid_price = 2.0
70
+ end
71
+
72
+ # Top-level instance definition
73
+ instances = RP::EMR::Instances.new do |i|
74
+ i.instance_groups = instance_groups
75
+ i.ec2_key_name = 'my_key_name'
76
+ i.hadoop_version = '2.0'
77
+ end
78
+
79
+ # Now we can construct the actual job
80
+ job = RP::EMR::Job.new do |j|
81
+ j.instances = instances
82
+ j.steps = [step, pig_step]
83
+ j.ami_version = :latest
84
+ j.bootstrap_actions = [bootstrap_action]
85
+ j.visible_to_all_users = true
86
+ j.job_flow_role = 'MyIAMRole'
87
+ j.tags = ['analysis']
88
+ end
89
+
90
+ # Launch the job using the AWS API
91
+ AWS::EMR.new.job_flows.create('job_name', job.to_hash)
92
+ ```
93
+
94
+
95
+ ## Thor Helpers
96
+
97
+ The API wrapper is all fine and dandy, but it's still a pain to work with. So
98
+ there's a set of Thor helpers to make building jobs easier - they define things
99
+ like defaults, option parsing, and other goodness.
100
+
101
+ The gem installs a script called `emr` which provides basic options if you want
102
+ to build jobs interactively
103
+
104
+ ```bash
105
+ bundle exec emr help
106
+ > Commands:
107
+ > emr add_pig_script_step JOB_ID SCRIPT_PATH # Add a Pig script step to an existing job
108
+ > emr add_rollup_step JOB_ID INPUT OUTPUT # Add a S3DistCp rollup step to an existing job
109
+ > emr add_setup_pig_step JOB_ID # Add a setup pig step to an existing job
110
+ > emr create_job JOB_NAME # Create an EMR job
111
+ > emr help [COMMAND] # Describe available commands or one specific command
112
+ >
113
+ > Options:
114
+ > -a, [--keep-alive], [--no-keep-alive] # Set to true if you want the cluster to stay alive after completion/failure
115
+ > -v, [--verbose], [--no-verbose] # Print lots of stuff
116
+ > [--dry-run], [--no-dry-run] # Don't actually talk to AWS
117
+ ```
118
+
119
+ While these can be useful, the real goal is to make it easy to roll your own
120
+ CLI using these as building blocks. This is accomplished by providing class-level
121
+ helpers to import the options used for each step, allowing you to invoke them
122
+ as modular components.
123
+
124
+ For example:
125
+
126
+ ```ruby
127
+ #!/usr/bin/env ruby
128
+
129
+ require 'rp/emr'
130
+ require 'thor'
131
+
132
+ class ExampleCLI < Thor
133
+ # This brings all the class-level helpers in
134
+ extend RP::EMR::CLI::TaskOptions
135
+
136
+ # Creates shared options like --dry-run and --verbose
137
+ cli_class_options
138
+
139
+ # We're going to write a CLI for launching a pig script. The first thing
140
+ # we do is give it a name (this is standard Thor)
141
+ desc "pig", "Test a pig script"
142
+
143
+ # We'll need to launch a cluster to do our computation with. This method adds
144
+ # the options we'll use to create the cluster. Values passed to the method are
145
+ # used as the defaults
146
+ create_job_method_options(
147
+ default_instance_type: 'm1.large',
148
+ core_instance_count: 2,
149
+ task_instance_count: 6,
150
+ job_flow_role: 'MyIAMRole',
151
+ )
152
+
153
+ # Here we're importing the options used to control how Pig is setup
154
+ add_setup_pig_step_method_options
155
+
156
+ # And here were importing options used to create a Pig step generally
157
+ add_pig_script_step_method_options(
158
+ script_bucket: 'my-emr-scripts-bucket',
159
+ )
160
+
161
+ # Let's define some options specific to the task we're trying to complete
162
+ method_option :output, default: 'counted_words'
163
+ def pig
164
+ script_path = File.expand_path('../count_words.pig', __FILE__)
165
+ input_path = "s3://my-input-bucket/words"
166
+ output_path = "s3://my-output-bucket/#{options[:output]}/#{Time.now.to_i}"
167
+
168
+ # These will be available in our Pig script as '$INPUT' and '$OUTPUT'
169
+ pig_step_args = { pig_params: options[:pig_params].merge(
170
+ 'INPUT' => input_path,
171
+ 'OUTPUT' => output_path,
172
+ )}
173
+
174
+ # Now that we've constructed our options, we'll use the Thor task in lib/rp/emr/cli
175
+ # to create a job flow. The task returns the job identifier, and we're passing
176
+ # the options hash that Thor parsed for us (this is why we did all that setup
177
+ # earlier)
178
+ job_id = invoke 'emr:create_job', ['Word Count Job'], options
179
+
180
+ # The job has been created, so we'll add a step to setup pig
181
+ invoke 'emr:add_setup_pig_step', [job_id], options
182
+
183
+ # And finally we'll add our pig script. Notice that we're merging the pig
184
+ # args into the options hash. We could also have passed these options as CLI
185
+ # options - this lets us do complicated stuff like date coercions in Ruby
186
+ invoke 'emr:add_pig_script_step', [job_id, script_path], options.merge(pig_step_args)
187
+ end
188
+ end
189
+
190
+ ExampleCLI.start
191
+ ```
192
+
193
+ Now, we can get a nice help page describing all the options available to us
194
+
195
+ ```bash
196
+ bundle exec ./word_count_cli --help
197
+ > Commands:
198
+ > word_count_cli help [COMMAND] # Describe available commands or one specific command
199
+ > word_count_cli pig # Test a pig script
200
+ >
201
+ > Options:
202
+ > -a, [--keep-alive], [--no-keep-alive] # Set to true if you want the cluster to stay alive after completion/failure
203
+ > -v, [--verbose], [--no-verbose] # Print lots of stuff
204
+ > [--dry-run], [--no-dry-run] # Don't actually talk to AWS
205
+
206
+ bundle exec ./word_count_cli help pig
207
+ > Usage:
208
+ > word_count_cli pig
209
+ >
210
+ > Options:
211
+ > -k, [--ec2-key-name=KEY_NAME] # An AWS keypair for the cluster. Useful if you want to shell into the cluster
212
+ > [--default-instance-type=INSTANCE_TYPE] # The EC2 instance type to use for the cluster
213
+ > # Default: m1.large
214
+ > [--master-instance-type=INSTANCE_TYPE] # The EC2 instance type to use for the cluster master group
215
+ > [--master-instance-count=N] # The number of task instances to create in the cluster master group
216
+ > [--core-instance-type=INSTANCE_TYPE] # The EC2 instance type to use for the cluster core group
217
+ > [--core-instance-count=N] # The number of task instances to create in the cluster core group
218
+ > # Default: 2
219
+ > [--task-instance-type=INSTANCE_TYPE] # The EC2 instance type to use for the cluster task group
220
+ > [--task-instance-count=N] # The number of task instances to create in the cluster task group
221
+ > # Default: 6
222
+ > [--task-bid-price=N.NN] # If set, will use spot instances for task trackers with this bid price
223
+ > [--job-flow-role=IAM_ROLE] # IAM Role for the job flow
224
+ > # Default: MyIAMRole
225
+ > [--script-bucket=BUCKET] # The S3 bucket to use for storing the Pig script
226
+ > # Default: my-emr-scripts-bucket
227
+ > -p, [--pig-params=PARAM:VALUE] # Parameters to be passed to the pig script
228
+ > [--output=OUTPUT]
229
+ > -a, [--keep-alive], [--no-keep-alive] # Set to true if you want the cluster to stay alive after completion/failure
230
+ > -v, [--verbose], [--no-verbose] # Print lots of stuff
231
+ > [--dry-run], [--no-dry-run] # Don't actually talk to AWS
232
+
233
+ bundle exec ./word_count_cli pig --output foo --dry-run
234
+ > -----------
235
+ > Created job flow job_flow_id with ["Word Count Job"], {"keep_alive"=>false, "verbose"=>false, "dry_run"=>true, ...}
236
+ > -----------
237
+ > Added setup pig step to job_flow_id with ["job_flow_id"], {"keep_alive"=>false, "verbose"=>false, "dry_run"=>true, ...}
238
+ > -----------
239
+ > Added pig script step to job_flow_id with ["job_flow_id", "count_words.pig"], {"keep_alive"=>false, ...}
240
+ ```
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require 'bundler/gem_tasks'
2
+
3
+ require 'rp/emr'
4
+
5
+ RSpec::Core::RakeTask.new(:spec)
6
+ task :default => :spec
data/bin/emr ADDED
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require File.expand_path('../../lib/rp/emr', __FILE__)
4
+
5
+ RP::EMR::CLI.start
@@ -0,0 +1,38 @@
1
+ module RP
2
+ module EMR
3
+ # Bootstrap action wrapper
4
+ #
5
+ # @example
6
+ # def bootstrap_hadoop
7
+ # RP::EMR::BootstrapAction.new(
8
+ # name: 'Configure Hadoop',
9
+ # path: 's3://elasticmapreduce/bootstrap-actions/configure-hadoop',
10
+ # args: ['-c', 'fs.s3n.multipart.uploads.enabled=false']
11
+ # )
12
+ # end
13
+ #
14
+ # def bootstrap_daemons
15
+ # RP::EMR::BootstrapAction.new(
16
+ # name: 'Configure Daemons',
17
+ # path: 's3://elasticmapreduce/bootstrap-actions/configure-daemons',
18
+ # args: ['--namenode-heap-size=15000'],
19
+ # )
20
+ # end
21
+ #
22
+ class BootstrapAction
23
+ extend Assembler
24
+
25
+ assemble_from :name, :path, args: []
26
+
27
+ def to_hash
28
+ {
29
+ name: name,
30
+ script_bootstrap_action: {
31
+ path: path,
32
+ args: args,
33
+ },
34
+ }
35
+ end
36
+ end
37
+ end
38
+ end
data/lib/rp/emr/cli.rb ADDED
@@ -0,0 +1,249 @@
1
+ module RP
2
+ module EMR
3
+ class CLI < Thor
4
+ module TaskOptions
5
+ def cli_class_options
6
+ class_option :keep_alive, aliases: '-a', default: false, type: :boolean, desc: 'Set to true if you want the cluster to stay alive after completion/failure'
7
+ class_option :verbose, aliases: '-v', default: false, type: :boolean, desc: 'Print lots of stuff'
8
+ class_option :dry_run, default: false, type: :boolean, desc: "Don't actually talk to AWS"
9
+ end
10
+
11
+ def create_job_method_options(defaults = {})
12
+ method_option(:ec2_key_name,
13
+ default: defaults[:ec2_key_name],
14
+ aliases: '-k',
15
+ banner: 'KEY_NAME',
16
+ desc: "An AWS keypair for the cluster. Useful if you want to shell into the cluster",
17
+ )
18
+ method_option(:default_instance_type,
19
+ default: defaults[:default_instance_type],
20
+ banner: 'INSTANCE_TYPE',
21
+ desc: "The EC2 instance type to use for the cluster",
22
+ )
23
+ method_option(:master_instance_type,
24
+ default: defaults[:master_instance_type],
25
+ banner: 'INSTANCE_TYPE',
26
+ desc: "The EC2 instance type to use for the cluster master group",
27
+ )
28
+ method_option(:master_instance_count,
29
+ default: defaults[:master_instance_count],
30
+ type: :numeric,
31
+ banner: 'N',
32
+ desc: "The number of task instances to create in the cluster master group",
33
+ )
34
+ method_option(:core_instance_type,
35
+ default: defaults[:core_instance_type],
36
+ banner: 'INSTANCE_TYPE',
37
+ desc: "The EC2 instance type to use for the cluster core group",
38
+ )
39
+ method_option(:core_instance_count,
40
+ default: defaults[:core_instance_count],
41
+ type: :numeric,
42
+ banner: 'N',
43
+ desc: "The number of task instances to create in the cluster core group",
44
+ )
45
+ method_option(:task_instance_type,
46
+ default: defaults[:task_instance_type],
47
+ banner: 'INSTANCE_TYPE',
48
+ desc: "The EC2 instance type to use for the cluster task group",
49
+ )
50
+ method_option(:task_instance_count,
51
+ default: defaults[:task_instance_count],
52
+ type: :numeric,
53
+ banner: 'N',
54
+ desc: "The number of task instances to create in the cluster task group",
55
+ )
56
+ method_option(:task_bid_price,
57
+ default: defaults[:task_bid_price],
58
+ type: :numeric,
59
+ banner: 'N.NN',
60
+ desc: "If set, will use spot instances for task trackers with this bid price",
61
+ )
62
+ method_option(:job_flow_role,
63
+ default: defaults[:job_flow_role],
64
+ banner: 'IAM_ROLE',
65
+ desc: "IAM Role for the job flow",
66
+ )
67
+ method_option(:service_role,
68
+ default: defaults[:service_role],
69
+ banner: 'IAM_ROLE',
70
+ desc: "IAM Role for the service",
71
+ )
72
+ end
73
+
74
+ def add_setup_pig_step_method_options(defaults = {})
75
+ method_option(:pig_version,
76
+ default: defaults[:pig_version] || '0.11.1.1',
77
+ desc: 'Version of Pig to install'
78
+ )
79
+ end
80
+
81
+ def add_setup_hive_step_method_options(defaults = {})
82
+ method_option(:hive_version,
83
+ default: defaults[:hive_version] || 'latest',
84
+ desc: 'Version of Hive to install'
85
+ )
86
+ end
87
+
88
+ def add_rollup_step_method_options(defaults = {})
89
+ method_option(:rollup_input_pattern,
90
+ default: defaults[:rollup_input_pattern],
91
+ desc: 'Java-compatable regex to filter input',
92
+ )
93
+ method_option(:rollup_group_by,
94
+ default: defaults[:rollup_group_by],
95
+ desc: 'Java-compatable regex with a single capture group',
96
+ )
97
+ method_option(:rollup_target_size,
98
+ default: defaults[:rollup_target_size],
99
+ type: :numeric,
100
+ desc: 'The target file size for rolled up files',
101
+ )
102
+ end
103
+
104
+ def add_pig_script_step_method_options(defaults = {})
105
+ method_option(:script_bucket,
106
+ default: defaults[:script_bucket],
107
+ banner: 'BUCKET',
108
+ desc: 'The S3 bucket to use for storing the Pig script',
109
+ )
110
+ method_option(:pig_params,
111
+ default: defaults[:pig_params] || {},
112
+ aliases: '-p',
113
+ type: :hash,
114
+ banner: 'PARAM:VALUE',
115
+ desc: 'Parameters to be passed to the pig script',
116
+ )
117
+ end
118
+ end
119
+
120
+ extend TaskOptions
121
+
122
+ namespace :emr
123
+
124
+ cli_class_options
125
+
126
+ desc "create_job JOB_NAME", "Create an EMR job"
127
+ create_job_method_options
128
+ def create_job(job_name, *)
129
+ instances = RP::EMR::Instances.new do |i|
130
+ i.hadoop_version = '2.2.0'
131
+ i.ec2_key_name = options[:ec2_key_name] if options[:ec2_key_name]
132
+ i.keep_job_flow_alive_when_no_steps = options[:keep_alive]
133
+
134
+ i.instance_groups = RP::EMR::InstanceGroups.new do |ig|
135
+ ig.default_instance_type = options[:default_instance_type] if options[:default_instance_type]
136
+
137
+ ig.master_instance_type = options[:master_instance_type] if options[:master_instance_type]
138
+ ig.master_instance_count = options[:master_instance_count] if options[:master_instance_count]
139
+
140
+ ig.core_instance_type = options[:core_instance_type] if options[:core_instance_type]
141
+ ig.core_instance_count = options[:core_instance_count] if options[:core_instance_count]
142
+
143
+ ig.task_instance_type = options[:task_instance_type] if options[:task_instance_type]
144
+ ig.task_instance_count = options[:task_instance_count] if options[:task_instance_count]
145
+ ig.task_bid_price = options[:task_bid_price] if options[:task_bid_price]
146
+ end.to_a
147
+ end
148
+
149
+ setup_debugging_step = RP::EMR::Step::SetupDebugging.new do |s|
150
+ s.action_on_failure = 'CANCEL_AND_WAIT' if options[:keep_alive]
151
+ end
152
+
153
+ job = RP::EMR::Job.new do |job|
154
+ job.log_uri = "s3://oib-mapreduce/logs/mosaic_analysis/#{job_name.underscore}"
155
+ job.instances = instances.to_hash
156
+ job.steps = [setup_debugging_step.to_hash]
157
+ job.job_flow_role = options[:job_flow_role] if options[:job_flow_role]
158
+ job.service_role = options[:service_role] if options[:service_role]
159
+ end
160
+
161
+ if options[:dry_run]
162
+ job_flow = OpenStruct.new(id: 'job_flow_id')
163
+ else
164
+ job_flow = AWS::EMR.new.job_flows.create(job_name, job.to_hash)
165
+ end
166
+ puts '-----------'
167
+ puts "Created job flow #{job_flow.id} with #{args}, #{options}"
168
+ pp job.to_hash if options[:verbose]
169
+
170
+ return job_flow.id
171
+ end
172
+
173
+ desc "add_setup_pig_step JOB_ID", "Add a setup pig step to an existing job"
174
+ add_setup_pig_step_method_options
175
+ def add_setup_pig_step(job_id, *)
176
+ job = AWS::EMR.new.job_flows[job_id]
177
+
178
+ step = RP::EMR::Step::SetupPig.new do |s|
179
+ s.action_on_failure = 'CANCEL_AND_WAIT' if options[:keep_alive]
180
+ s.pig_version = options[:pig_version] if options[:pig_version]
181
+ end
182
+
183
+ job.add_steps([step.to_hash]) unless options[:dry_run]
184
+ puts '-----------'
185
+ puts "Added setup pig step to #{job.id} with #{args}, #{options}"
186
+ pp step.to_hash if options[:verbose]
187
+ end
188
+
189
+ desc "add_setup_hive_step JOB_ID", "Add a setup hive step to an existing job"
190
+ add_setup_hive_step_method_options
191
+ def add_setup_hive_step(job_id, *)
192
+ job = AWS::EMR.new.job_flows[job_id]
193
+
194
+ step = RP::EMR::Step::SetupHive.new do |s|
195
+ s.action_on_failure = 'CANCEL_AND_WAIT' if options[:keep_alive]
196
+ s.hive_version = options[:hive_version] if options[:hive_version]
197
+ end
198
+
199
+ job.add_steps([step.to_hash]) unless options[:dry_run]
200
+ puts '-----------'
201
+ puts "Added setup hive step to #{job.id} with #{args}, #{options}"
202
+ pp step.to_hash if options[:verbose]
203
+ end
204
+
205
+ desc "add_rollup_step JOB_ID INPUT OUTPUT", "Add a S3DistCp rollup step to an existing job"
206
+ add_rollup_step_method_options
207
+ def add_rollup_step(job_id, input, output, *)
208
+ job = AWS::EMR.new.job_flows[job_id]
209
+
210
+ step = RP::EMR::Step::S3DistCp.new(
211
+ name: 'Rollup',
212
+ src: input,
213
+ dest: output,
214
+ ) do |s|
215
+ s.srcPattern = options[:rollup_input_pattern] if options[:rollup_input_pattern]
216
+ s.groupBy = options[:rollup_group_by] if options[:rollup_group_by]
217
+ s.targetSize = options[:rollup_target_size] if options[:rollup_target_size]
218
+ s.action_on_failure = 'CANCEL_AND_WAIT' if options[:keep_alive]
219
+ end
220
+
221
+ job.add_steps([step.to_hash]) unless options[:dry_run]
222
+ puts '-----------'
223
+ puts "Added rollup step to #{job.id} with #{args}, #{options}"
224
+ pp step.to_hash if options[:verbose]
225
+ end
226
+
227
+ desc "add_pig_script_step JOB_ID SCRIPT_PATH", "Add a Pig script step to an existing job"
228
+ add_pig_script_step_method_options
229
+ def add_pig_script_step(job_id, script_path, *)
230
+ job = AWS::EMR.new.job_flows[job_id]
231
+
232
+ step = RP::EMR::Step::Pig.new(
233
+ name: 'Pig',
234
+ script_path: script_path,
235
+ script_bucket: options[:script_bucket],
236
+ ) do |s|
237
+ s.pig_params = options[:pig_params] if options[:pig_params]
238
+ s.action_on_failure = 'CANCEL_AND_WAIT' if options[:keep_alive]
239
+ s.dry_run = options[:dry_run]
240
+ end
241
+
242
+ job.add_steps([step.to_hash]) unless options[:dry_run]
243
+ puts '-----------'
244
+ puts "Added pig script step to #{job.id} with #{args}, #{options}"
245
+ pp step.to_hash if options[:verbose]
246
+ end
247
+ end
248
+ end
249
+ end
@@ -0,0 +1,36 @@
1
+ module RP
2
+ module EMR
3
+ class InstanceGroup
4
+ extend Assembler
5
+
6
+ assemble_from(
7
+ # Required params
8
+ :instance_role,
9
+ :instance_type,
10
+ :instance_count,
11
+
12
+ # Optional params
13
+ name: nil,
14
+ market: nil,
15
+ bid_price: nil,
16
+ )
17
+
18
+ def to_hash
19
+ {
20
+ name: name,
21
+ market: market,
22
+ instance_role: instance_role,
23
+ bid_price: bid_price.to_s,
24
+ instance_type: instance_type,
25
+ instance_count: instance_count,
26
+ }.reject { |k,v| !v || (v.respond_to?(:empty?) && v.empty?) }
27
+ end
28
+
29
+ private
30
+
31
+ def market
32
+ bid_price ? 'SPOT' : @market
33
+ end
34
+ end
35
+ end
36
+ end