RubyGems - rp-emr - Versions diffs - 1.0.3 - Mend

rp-emr 1.0.3

Files changed (37) hide show

checksums.yaml +7 -0
data/.gitignore +4 -0
data/.rspec +1 -0
data/CHANGELOG.md +18 -0
data/Gemfile +3 -0
data/LICENSE.txt +23 -0
data/README.md +240 -0
data/Rakefile +6 -0
data/bin/emr +5 -0
data/lib/rp/emr/bootstrap_action.rb +38 -0
data/lib/rp/emr/cli.rb +249 -0
data/lib/rp/emr/instance_group.rb +36 -0
data/lib/rp/emr/instance_groups.rb +66 -0
data/lib/rp/emr/instances.rb +36 -0
data/lib/rp/emr/job.rb +42 -0
data/lib/rp/emr/step/pig.rb +84 -0
data/lib/rp/emr/step/s3_dist_cp.rb +93 -0
data/lib/rp/emr/step/setup_debugging.rb +28 -0
data/lib/rp/emr/step/setup_hive.rb +36 -0
data/lib/rp/emr/step/setup_pig.rb +36 -0
data/lib/rp/emr/step.rb +21 -0
data/lib/rp/emr/version.rb +5 -0
data/lib/rp/emr.rb +26 -0
data/rp-emr.gemspec +31 -0
data/spec/rp/emr/bootstrap_action_spec.rb +23 -0
data/spec/rp/emr/instance_group_spec.rb +51 -0
data/spec/rp/emr/instance_groups_spec.rb +106 -0
data/spec/rp/emr/instances_spec.rb +23 -0
data/spec/rp/emr/job_spec.rb +31 -0
data/spec/rp/emr/step/pig_spec.rb +136 -0
data/spec/rp/emr/step/s3_dist_cp_step_spec.rb +83 -0
data/spec/rp/emr/step/setup_debugging_spec.rb +29 -0
data/spec/rp/emr/step/setup_pig_spec.rb +47 -0
data/spec/rp/emr/step_spec.rb +33 -0
data/spec/rp/emr_spec.rb +5 -0
data/spec/spec_helper.rb +10 -0
metadata +221 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 608a51bf27824ca9856bbdc548a0d810b0e5af65
+  data.tar.gz: 0f4d6fe64878316614c9bccafab5cee6e553bbb8
+SHA512:
+  metadata.gz: b6d2180e2f2bc109ffca80e3f531297dad50f94f7dfef79ce8bc21ac2010162e09fa2d7d87691031cf00784de4c93d71704822aec102800edce5d36c73609b1c
+  data.tar.gz: 213a4f7215cf3848cef5986a3c727e131f6494c32806b336cbf1c134b89b0330688e6ea4997be8783faa6a652d0a870dac105f30ff55711e5b2edef33e175e33

data/.gitignore ADDED Viewed

@@ -0,0 +1,4 @@
+.bundle
+.idea
+pkg/
+Gemfile.lock

data/.rspec ADDED Viewed

	@@ -0,0 +1 @@
1	+ --color

data/CHANGELOG.md ADDED Viewed

@@ -0,0 +1,18 @@
+# 1.0.3
+  * Implement SetupHive step
+# 1.0.2
+  * Add service_role option when launching a Job.
+  * BUG: Fix job_flow_role option to be properly set when passed to Job in CLI launcher.
+# 1.0.1
+  * `add_setup_pig_step_method_options` now allows `pig_version` to be configured
+  * `S3DistCp` now allows for the `s3_distcp_jar` to be configured
+# 1.0.0
+* 1 major enhancement
+  * Birthday!

data/Gemfile ADDED Viewed

@@ -0,0 +1,3 @@
+source 'https://rubygems.org'
+gemspec

data/LICENSE.txt ADDED Viewed

@@ -0,0 +1,23 @@
+Copyright (c) 2013 ReturnPath, Inc.
+MIT License
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.md ADDED Viewed

@@ -0,0 +1,240 @@
+# ReturnPath EMR Tools
+This is a Ruby library for creating & launching jobs on AWS's Elastic MapReduce
+service. The library provides two basic tools: a set of classes to encapsulate
+the data structures expected by the EMR client, and a set of Thor helpers to
+simplify building job launchers.
+## Client Wrapper
+The AWS EMR client is very low level, and basically expects a hash of values.
+rp-emr provides wrappers for the basic data types and some helpers for building
+collections.  All objects are built using the
+[assembler](https://github.com/benhamill/assembler) gem, so you can mix values
+between method-call syntax and builder-block syntax.
+The basic bits look like this:
+```ruby
+# Executes a script before the cluster starts processing steps
+bootstrap_action = RP::EMR::BootstrapAction.new(name: 'action name') do |a|
+  a.path = 's3://path_to_script_to_run'
+  a.args = ['--option value', '--other-option value']
+end
+# Runs a hadoop jar.  This is the bare-bones version, you'll probably want to
+# use one of the classes in lib/rp/emr/step
+step = RP::EMR::Step.new(name: 'step name') do |s|
+  s.action_on_failure = 'CANCEL_AND_WAIT'
+  s.hadoop_jar_step = {
+    jar: 's3://path_to_jar',
+    args: ['--option value', '--other-option value'],
+  }
+end
+# Runs a pig script
+pig_step = RP::EMR::Step::Pig.new(name: 'pig step') do |s|
+  s.script_path = '/local/path/to/pig_script.pig'
+  s.script_bucket = 'bucket_to_upload_script_to'
+  s.args = ['--args_to_append_to_job']
+  s.pig_params = {'PIG_PARAM' => 'value'}
+  s.pig_version = '0.11.1.1'
+  s.action_on_failure = 'CANCEL_AND_WAIT'
+  s.dry_run = false
+end
+# There are also steps for setting up pig, setting up debugging, using S3DistCP, etc
+# Creates an instance group.  As with RP::EMR::Step, you probably shouldn't be
+# using this directly, just RP::EMR::InstanceGroups instead
+instance_group = RP::EMR::InstanceGroup.new(name: 'custom instance group') do |ig|
+  ig.instance_role = 'MASTER'
+  ig.instance_type = 'c1.medium'
+  ig.instance_count = 100
+  ig.market = 'SPOT'
+  ig.bid_price = 2.0
+end
+# Defines the different instance groups to be used.  All the options for
+# RP::EMR::InstanceGroup are supported, along with a default instance type
+instance_groups = RP::EMR::InstanceGroups.new do |ig|
+  ig.default_instance_type = 'c1.medium'
+  ig.master_instance_type = 'c3.xlarge'
+  ig.core_instance_count = 5
+  ig.task_instance_count = 100
+  ig.task_instance_market = 'SPOT'
+  ig.task_bid_price = 2.0
+end
+# Top-level instance definition
+instances = RP::EMR::Instances.new do |i|
+  i.instance_groups = instance_groups
+  i.ec2_key_name = 'my_key_name'
+  i.hadoop_version = '2.0'
+end
+# Now we can construct the actual job
+job = RP::EMR::Job.new do |j|
+  j.instances = instances
+  j.steps = [step, pig_step]
+  j.ami_version = :latest
+  j.bootstrap_actions = [bootstrap_action]
+  j.visible_to_all_users = true
+  j.job_flow_role = 'MyIAMRole'
+  j.tags = ['analysis']
+end
+# Launch the job using the AWS API
+AWS::EMR.new.job_flows.create('job_name', job.to_hash)
+```
+## Thor Helpers
+The API wrapper is all fine and dandy, but it's still a pain to work with.  So
+there's a set of Thor helpers to make building jobs easier - they define things
+like defaults, option parsing, and other goodness.
+The gem installs a script called `emr` which provides basic options if you want
+to build jobs interactively:
+```bash
+bundle exec emr help
+> Commands:
+>   emr add_pig_script_step JOB_ID SCRIPT_PATH  # Add a Pig script step to an existing job
+>   emr add_rollup_step JOB_ID INPUT OUTPUT     # Add a S3DistCp rollup step to an existing job
+>   emr add_setup_pig_step JOB_ID               # Add a setup pig step to an existing job
+>   emr create_job JOB_NAME                     # Create an EMR job
+>   emr help [COMMAND]                          # Describe available commands or one specific command
+>
+> Options:
+>   -a, [--keep-alive], [--no-keep-alive]  # Set to true if you want the cluster to stay alive after completion/failure
+>   -v, [--verbose], [--no-verbose]        # Print lots of stuff
+>       [--dry-run], [--no-dry-run]        # Don't actually talk to AWS
+```
+While these can be useful, the real goal is to make it easy to roll your own
+CLI using these as building blocks.  This is accomplished by providing class-level
+helpers to import the options used for each step, allowing you to invoke them
+as modular components.
+For example:
+```ruby
+#!/usr/bin/env ruby
+require 'rp/emr'
+require 'thor'
+class ExampleCLI < Thor
+  # This brings all the class-level helpers in
+  extend RP::EMR::CLI::TaskOptions
+  # Creates shared options like --dry-run and --verbose
+  cli_class_options
+  # We're going to write a CLI for launching a pig script.  The first thing
+  # we do is give it a name (this is standard Thor)
+  desc "pig", "Test a pig script"
+  # We'll need to launch a cluster to do our computation with.  This method adds
+  # the options we'll use to create the cluster.  Values passed to the method are
+  # used as the defaults
+  create_job_method_options(
+    default_instance_type: 'm1.large',
+    core_instance_count: 2,
+    task_instance_count: 6,
+    job_flow_role: 'MyIAMRole',
+  )
+  # Here we're importing the options used to control how Pig is setup
+  add_setup_pig_step_method_options
+  # And here we're importing options used to create a Pig step generally
+  add_pig_script_step_method_options(
+    script_bucket: 'my-emr-scripts-bucket',
+  )
+  # Let's define some options specific to the task we're trying to complete
+  method_option :output, default: 'counted_words'
+  def pig
+    script_path   = File.expand_path('../count_words.pig', __FILE__)
+    input_path    = "s3://my-input-bucket/words"
+    output_path   = "s3://my-output-bucket/#{options[:output]}/#{Time.now.to_i}"
+    # These will be available in our Pig script as '$INPUT' and '$OUTPUT'
+    pig_step_args = { pig_params: options[:pig_params].merge(
+      'INPUT'   => input_path,
+      'OUTPUT'  => output_path,
+    )}
+    # Now that we've constructed our options, we'll use the Thor task in lib/rp/emr/cli
+    # to create a job flow.  The task returns the job identifier, and we're passing
+    # the options hash that Thor parsed for us (this is why we did all that setup
+    # earlier)
+    job_id = invoke 'emr:create_job', ['Word Count Job'], options
+    # The job has been created, so we'll add a step to setup pig
+    invoke 'emr:add_setup_pig_step', [job_id], options
+    # And finally we'll add our pig script.  Notice that we're merging the pig
+    # args into the options hash.  We could also have passed these options as CLI
+    # options - this lets us do complicated stuff like date coercions in Ruby
+    invoke 'emr:add_pig_script_step', [job_id, script_path], options.merge(pig_step_args)
+  end
+end
+ExampleCLI.start
+```
+Now, we can get a nice help page describing all the options available to us
+```bash
+bundle exec ./word_count_cli --help
+> Commands:
+>   word_count_cli help [COMMAND]  # Describe available commands or one specific command
+>   word_count_cli pig             # Test a pig script
+>
+> Options:
+>   -a, [--keep-alive], [--no-keep-alive]  # Set to true if you want the cluster to stay alive after completion/failure
+>   -v, [--verbose], [--no-verbose]        # Print lots of stuff
+>       [--dry-run], [--no-dry-run]        # Don't actually talk to AWS
+bundle exec ./word_count_cli help pig
+> Usage:
+>   word_count_cli pig
+>
+> Options:
+>   -k, [--ec2-key-name=KEY_NAME]                # An AWS keypair for the cluster.  Useful if you want to shell into the cluster
+>       [--default-instance-type=INSTANCE_TYPE]  # The EC2 instance type to use for the cluster
+>                                                # Default: m1.large
+>       [--master-instance-type=INSTANCE_TYPE]   # The EC2 instance type to use for the cluster master group
+>       [--master-instance-count=N]              # The number of task instances to create in the cluster master group
+>       [--core-instance-type=INSTANCE_TYPE]     # The EC2 instance type to use for the cluster core group
+>       [--core-instance-count=N]                # The number of task instances to create in the cluster core group
+>                                                # Default: 2
+>       [--task-instance-type=INSTANCE_TYPE]     # The EC2 instance type to use for the cluster task group
+>       [--task-instance-count=N]                # The number of task instances to create in the cluster task group
+>                                                # Default: 6
+>       [--task-bid-price=N.NN]                  # If set, will use spot instances for task trackers with this bid price
+>       [--job-flow-role=IAM_ROLE]               # IAM Role for the job flow
+>                                                # Default: MyIAMRole
+>       [--script-bucket=BUCKET]                 # The S3 bucket to use for storing the Pig script
+>                                                # Default: my-emr-scripts-bucket
+>   -p, [--pig-params=PARAM:VALUE]               # Parameters to be passed to the pig script
+>       [--output=OUTPUT]
+>   -a, [--keep-alive], [--no-keep-alive]        # Set to true if you want the cluster to stay alive after completion/failure
+>   -v, [--verbose], [--no-verbose]              # Print lots of stuff
+>       [--dry-run], [--no-dry-run]              # Don't actually talk to AWS
+bundle exec ./word_count_cli pig --output foo --dry-run
+> -----------
+> Created job flow job_flow_id with ["Word Count Job"], {"keep_alive"=>false, "verbose"=>false, "dry_run"=>true, ...}
+> -----------
+> Added setup pig step to job_flow_id with ["job_flow_id"], {"keep_alive"=>false, "verbose"=>false, "dry_run"=>true, ...}
+> -----------
+> Added pig script step to job_flow_id with ["job_flow_id", "count_words.pig"], {"keep_alive"=>false, ...}
+```

data/Rakefile ADDED Viewed

@@ -0,0 +1,6 @@
+require 'bundler/gem_tasks'
+require 'rp/emr'
+RSpec::Core::RakeTask.new(:spec)
+task :default => :spec

data/bin/emr ADDED Viewed

@@ -0,0 +1,5 @@
+#!/usr/bin/env ruby
+require File.expand_path('../../lib/rp/emr', __FILE__) # load lib/rp/emr by a path resolved relative to this script
+RP::EMR::CLI.start # RP::EMR::CLI is a Thor subclass; start runs it (presumably against ARGV - confirm in Thor docs)

data/lib/rp/emr/bootstrap_action.rb ADDED Viewed

@@ -0,0 +1,38 @@
+module RP
+  module EMR
+    # Bootstrap action wrapper
+    #
+    # @example
+    #   def bootstrap_hadoop
+    #     RP::EMR::BootstrapAction.new(
+    #       name: 'Configure Hadoop',
+    #       path: 's3://elasticmapreduce/bootstrap-actions/configure-hadoop',
+    #       args: ['-c', 'fs.s3n.multipart.uploads.enabled=false']
+    #     )
+    #   end
+    #
+    #   def bootstrap_daemons
+    #     RP::EMR::BootstrapAction.new(
+    #       name: 'Configure Daemons',
+    #       path: 's3://elasticmapreduce/bootstrap-actions/configure-daemons',
+    #       args: ['--namenode-heap-size=15000'],
+    #     )
+    #   end
+    #
+    class BootstrapAction
+      extend Assembler
+      assemble_from :name, :path, args: []
+      def to_hash
+        {
+          name: name,
+          script_bootstrap_action: {
+            path: path,
+            args: args,
+          },
+        }
+      end
+    end
+  end
+end

data/lib/rp/emr/cli.rb ADDED Viewed

@@ -0,0 +1,249 @@
+module RP
+  module EMR
+    class CLI < Thor
+      module TaskOptions
+        def cli_class_options
+          class_option :keep_alive, aliases: '-a', default: false, type: :boolean, desc: 'Set to true if you want the cluster to stay alive after completion/failure'
+          class_option :verbose, aliases: '-v', default: false, type: :boolean, desc: 'Print lots of stuff'
+          class_option :dry_run, default: false, type: :boolean, desc: "Don't actually talk to AWS"
+        end
+        def create_job_method_options(defaults = {})
+          method_option(:ec2_key_name,
+            default: defaults[:ec2_key_name],
+            aliases: '-k',
+            banner: 'KEY_NAME',
+            desc: "An AWS keypair for the cluster.  Useful if you want to shell into the cluster",
+          )
+          method_option(:default_instance_type,
+            default: defaults[:default_instance_type],
+            banner: 'INSTANCE_TYPE',
+            desc: "The EC2 instance type to use for the cluster",
+          )
+          method_option(:master_instance_type,
+            default: defaults[:master_instance_type],
+            banner: 'INSTANCE_TYPE',
+            desc: "The EC2 instance type to use for the cluster master group",
+          )
+          method_option(:master_instance_count,
+            default: defaults[:master_instance_count],
+            type: :numeric,
+            banner: 'N',
+            desc: "The number of task instances to create in the cluster master group",
+          )
+          method_option(:core_instance_type,
+            default: defaults[:core_instance_type],
+            banner: 'INSTANCE_TYPE',
+            desc: "The EC2 instance type to use for the cluster core group",
+          )
+          method_option(:core_instance_count,
+            default: defaults[:core_instance_count],
+            type: :numeric,
+            banner: 'N',
+            desc: "The number of task instances to create in the cluster core group",
+          )
+          method_option(:task_instance_type,
+            default: defaults[:task_instance_type],
+            banner: 'INSTANCE_TYPE',
+            desc: "The EC2 instance type to use for the cluster task group",
+          )
+          method_option(:task_instance_count,
+            default: defaults[:task_instance_count],
+            type: :numeric,
+            banner: 'N',
+            desc: "The number of task instances to create in the cluster task group",
+          )
+          method_option(:task_bid_price,
+            default: defaults[:task_bid_price],
+            type: :numeric,
+            banner: 'N.NN',
+            desc: "If set, will use spot instances for task trackers with this bid price",
+          )
+          method_option(:job_flow_role,
+            default: defaults[:job_flow_role],
+            banner: 'IAM_ROLE',
+            desc: "IAM Role for the job flow",
+          )
+          method_option(:service_role,
+                        default: defaults[:service_role],
+                        banner: 'IAM_ROLE',
+                        desc: "IAM Role for the service",
+          )
+        end
+        def add_setup_pig_step_method_options(defaults = {})
+          method_option(:pig_version,
+            default: defaults[:pig_version] || '0.11.1.1',
+            desc: 'Version of Pig to install'
+          )
+        end
+        def add_setup_hive_step_method_options(defaults = {})
+          method_option(:hive_version,
+                        default: defaults[:hive_version] || 'latest',
+                        desc: 'Version of Hive to install'
+          )
+        end
+        def add_rollup_step_method_options(defaults = {})
+          method_option(:rollup_input_pattern,
+            default: defaults[:rollup_input_pattern],
+            desc: 'Java-compatable regex to filter input',
+          )
+          method_option(:rollup_group_by,
+            default: defaults[:rollup_group_by],
+            desc: 'Java-compatable regex with a single capture group',
+          )
+          method_option(:rollup_target_size,
+            default: defaults[:rollup_target_size],
+            type: :numeric,
+            desc: 'The target file size for rolled up files',
+          )
+        end
+        def add_pig_script_step_method_options(defaults = {})
+          method_option(:script_bucket,
+            default: defaults[:script_bucket],
+            banner: 'BUCKET',
+            desc: 'The S3 bucket to use for storing the Pig script',
+          )
+          method_option(:pig_params,
+            default: defaults[:pig_params] || {},
+            aliases: '-p',
+            type: :hash,
+            banner: 'PARAM:VALUE',
+            desc: 'Parameters to be passed to the pig script',
+          )
+        end
+      end
+      extend TaskOptions
+      namespace :emr
+      cli_class_options
+      desc "create_job JOB_NAME", "Create an EMR job"
+      create_job_method_options
+      def create_job(job_name, *)
+        instances = RP::EMR::Instances.new do |i|
+          i.hadoop_version = '2.2.0'
+          i.ec2_key_name = options[:ec2_key_name] if options[:ec2_key_name]
+          i.keep_job_flow_alive_when_no_steps = options[:keep_alive]
+          i.instance_groups = RP::EMR::InstanceGroups.new do |ig|
+            ig.default_instance_type = options[:default_instance_type] if options[:default_instance_type]
+            ig.master_instance_type = options[:master_instance_type] if options[:master_instance_type]
+            ig.master_instance_count = options[:master_instance_count] if options[:master_instance_count]
+            ig.core_instance_type = options[:core_instance_type] if options[:core_instance_type]
+            ig.core_instance_count = options[:core_instance_count] if options[:core_instance_count]
+            ig.task_instance_type = options[:task_instance_type] if options[:task_instance_type]
+            ig.task_instance_count = options[:task_instance_count] if options[:task_instance_count]
+            ig.task_bid_price = options[:task_bid_price] if options[:task_bid_price]
+          end.to_a
+        end
+        setup_debugging_step = RP::EMR::Step::SetupDebugging.new do |s|
+          s.action_on_failure = 'CANCEL_AND_WAIT' if options[:keep_alive]
+        end
+        job = RP::EMR::Job.new do |job|
+          job.log_uri = "s3://oib-mapreduce/logs/mosaic_analysis/#{job_name.underscore}"
+          job.instances = instances.to_hash
+          job.steps = [setup_debugging_step.to_hash]
+          job.job_flow_role = options[:job_flow_role] if options[:job_flow_role]
+          job.service_role = options[:service_role] if options[:service_role]
+        end
+        if options[:dry_run]
+          job_flow = OpenStruct.new(id: 'job_flow_id')
+        else
+          job_flow = AWS::EMR.new.job_flows.create(job_name, job.to_hash)
+        end
+        puts '-----------'
+        puts "Created job flow #{job_flow.id} with #{args}, #{options}"
+        pp job.to_hash if options[:verbose]
+        return job_flow.id
+      end
+      desc "add_setup_pig_step JOB_ID", "Add a setup pig step to an existing job"
+      add_setup_pig_step_method_options
+      def add_setup_pig_step(job_id, *)
+        job = AWS::EMR.new.job_flows[job_id]
+        step = RP::EMR::Step::SetupPig.new do |s|
+          s.action_on_failure = 'CANCEL_AND_WAIT' if options[:keep_alive]
+          s.pig_version = options[:pig_version] if options[:pig_version]
+        end
+        job.add_steps([step.to_hash]) unless options[:dry_run]
+        puts '-----------'
+        puts "Added setup pig step to #{job.id} with #{args}, #{options}"
+        pp step.to_hash if options[:verbose]
+      end
+      desc "add_setup_hive_step JOB_ID", "Add a setup hive step to an existing job"
+      add_setup_hive_step_method_options
+      def add_setup_hive_step(job_id, *)
+        job = AWS::EMR.new.job_flows[job_id]
+        step = RP::EMR::Step::SetupHive.new do |s|
+          s.action_on_failure = 'CANCEL_AND_WAIT' if options[:keep_alive]
+          s.hive_version = options[:hive_version] if options[:hive_version]
+        end
+        job.add_steps([step.to_hash]) unless options[:dry_run]
+        puts '-----------'
+        puts "Added setup hive step to #{job.id} with #{args}, #{options}"
+        pp step.to_hash if options[:verbose]
+      end
+      desc "add_rollup_step JOB_ID INPUT OUTPUT", "Add a S3DistCp rollup step to an existing job"
+      add_rollup_step_method_options
+      def add_rollup_step(job_id, input, output, *)
+        job = AWS::EMR.new.job_flows[job_id]
+        step = RP::EMR::Step::S3DistCp.new(
+          name: 'Rollup',
+          src: input,
+          dest: output,
+        ) do |s|
+          s.srcPattern = options[:rollup_input_pattern] if options[:rollup_input_pattern]
+          s.groupBy = options[:rollup_group_by] if options[:rollup_group_by]
+          s.targetSize = options[:rollup_target_size] if options[:rollup_target_size]
+          s.action_on_failure = 'CANCEL_AND_WAIT' if options[:keep_alive]
+        end
+        job.add_steps([step.to_hash]) unless options[:dry_run]
+        puts '-----------'
+        puts "Added rollup step to #{job.id} with #{args}, #{options}"
+        pp step.to_hash if options[:verbose]
+      end
+      desc "add_pig_script_step JOB_ID SCRIPT_PATH", "Add a Pig script step to an existing job"
+      add_pig_script_step_method_options
+      def add_pig_script_step(job_id, script_path, *)
+        job = AWS::EMR.new.job_flows[job_id]
+        step = RP::EMR::Step::Pig.new(
+          name: 'Pig',
+          script_path: script_path,
+          script_bucket: options[:script_bucket],
+        ) do |s|
+          s.pig_params = options[:pig_params] if options[:pig_params]
+          s.action_on_failure = 'CANCEL_AND_WAIT' if options[:keep_alive]
+          s.dry_run = options[:dry_run]
+        end
+        job.add_steps([step.to_hash]) unless options[:dry_run]
+        puts '-----------'
+        puts "Added pig script step to #{job.id} with #{args}, #{options}"
+        pp step.to_hash if options[:verbose]
+      end
+    end
+  end
+end

data/lib/rp/emr/instance_group.rb ADDED Viewed

@@ -0,0 +1,36 @@
+module RP
+  module EMR
+    class InstanceGroup
+      extend Assembler
+      assemble_from(
+        # Required params
+        :instance_role,
+        :instance_type,
+        :instance_count,
+        # Optional params
+        name: nil,
+        market: nil,
+        bid_price: nil,
+      )
+      def to_hash
+        {
+          name: name,
+          market: market,
+          instance_role: instance_role,
+          bid_price: bid_price.to_s,
+          instance_type: instance_type,
+          instance_count: instance_count,
+        }.reject { |k,v| !v || (v.respond_to?(:empty?) && v.empty?) }
+      end
+      private
+      def market
+        bid_price ? 'SPOT' : @market
+      end
+    end
+  end
+end