rp-emr 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,66 @@
module RP
  module EMR
    # Builds the MASTER, CORE and TASK instance-group definitions of a job flow.
    #
    # Every role has its own instance type, count, market and bid price. A role
    # whose instance type is left nil uses +default_instance_type+ instead.
    class InstanceGroups
      extend Assembler

      assemble_from(
        default_instance_type: 't1.micro',

        master_instance_type: nil,
        master_instance_count: 1,
        master_market: 'ON_DEMAND',
        master_bid_price: nil,

        core_instance_type: nil,
        core_instance_count: 1,
        core_market: 'ON_DEMAND',
        core_bid_price: nil,

        task_instance_type: nil,
        task_instance_count: 1,
        task_market: 'ON_DEMAND',
        task_bid_price: nil,
      )

      # Serializes the three groups in MASTER, CORE, TASK order.
      #
      # @return [Array] the +to_hash+ result of each group, leaving out every
      #   group whose :instance_count equals 0
      def to_a
        group_hashes = [
          master_instance_group.to_hash,
          core_instance_group.to_hash,
          task_instance_group.to_hash,
        ]

        group_hashes.reject { |group| group[:instance_count] == 0 }
      end

      private

      def master_instance_group
        build_instance_group('MASTER', master_instance_type, master_instance_count, master_market, master_bid_price)
      end

      def core_instance_group
        build_instance_group('CORE', core_instance_type, core_instance_count, core_market, core_bid_price)
      end

      def task_instance_group
        build_instance_group('TASK', task_instance_type, task_instance_count, task_market, task_bid_price)
      end

      # Shared constructor for the three role-specific builders above.
      def build_instance_group(role, type, count, market, bid_price)
        RP::EMR::InstanceGroup.new do |group|
          group.instance_role = role
          group.instance_type = type || default_instance_type
          group.instance_count = count
          group.market = market
          group.bid_price = bid_price
        end
      end
    end
  end
end
@@ -0,0 +1,36 @@
module RP
  module EMR
    # Value object for the :instances section of a job-flow request.
    #
    # All attributes are optional and default to nil.
    class Instances
      extend Assembler

      assemble_from(
        # Optional params
        master_instance_type: nil,
        slave_instance_type: nil,
        instance_count: nil,
        instance_groups: nil,
        ec2_key_name: nil,
        placement: nil,
        keep_job_flow_alive_when_no_steps: nil,
        termination_protected: nil,
        hadoop_version: nil,
        ec2_subnet_id: nil,
      )

      # Serializes the attributes that have been given a value.
      #
      # Keys whose value is nil or empty (anything answering true to +empty?+)
      # are left out. An explicit +false+ is kept, so a caller who sets
      # +termination_protected+ or +keep_job_flow_alive_when_no_steps+ to false
      # gets that choice in the hash instead of having it silently dropped.
      #
      # @return [Hash] attribute name => value
      def to_hash
        {
          master_instance_type: master_instance_type,
          slave_instance_type: slave_instance_type,
          instance_count: instance_count,
          instance_groups: instance_groups,
          ec2_key_name: ec2_key_name,
          placement: placement,
          keep_job_flow_alive_when_no_steps: keep_job_flow_alive_when_no_steps,
          termination_protected: termination_protected,
          hadoop_version: hadoop_version,
          ec2_subnet_id: ec2_subnet_id,
        }.reject { |_key, value| value.nil? || (value.respond_to?(:empty?) && value.empty?) }
      end
    end
  end
end
data/lib/rp/emr/job.rb ADDED
@@ -0,0 +1,42 @@
module RP
  module EMR
    # Value object describing a job flow: its instances plus optional steps,
    # bootstrap actions, logging, roles and tags.
    class Job
      extend Assembler

      assemble_from(
        # Required params
        :instances,

        # Optional params
        steps: nil,
        log_uri: nil,
        additional_info: nil,
        ami_version: :latest,
        bootstrap_actions: nil,
        supported_products: nil,
        new_supported_products: nil,
        visible_to_all_users: true,
        job_flow_role: nil,
        service_role: nil,
        tags: nil,
      )

      # Serializes the job's attributes.
      #
      # +ami_version+ is converted with +to_s+. Keys whose value is nil or
      # empty (anything answering true to +empty?+) are left out. An explicit
      # +false+ is kept: +visible_to_all_users+ defaults to true here, so a
      # caller who opts out with false must see that value in the hash rather
      # than have the key dropped.
      #
      # @return [Hash] attribute name => value
      def to_hash
        {
          instances: instances,
          log_uri: log_uri,
          additional_info: additional_info,
          ami_version: ami_version.to_s,
          steps: steps,
          bootstrap_actions: bootstrap_actions,
          supported_products: supported_products,
          new_supported_products: new_supported_products,
          visible_to_all_users: visible_to_all_users,
          job_flow_role: job_flow_role,
          service_role: service_role,
          tags: tags,
        }.reject { |_key, value| value.nil? || (value.respond_to?(:empty?) && value.empty?) }
      end
    end
  end
end
@@ -0,0 +1,84 @@
require 'digest/md5'

module RP
  module EMR
    class Step
      # Step that runs a local Pig script through the EMR script-runner jar.
      #
      # Unless +dry_run+ is set, the first call to #to_hash writes the script to
      # +script_bucket+ under a key that embeds the MD5 digest of its contents.
      class Pig
        extend Assembler

        assemble_from(
          :name,
          :script_path,
          :script_bucket,
          args: [],
          pig_params: {},
          pig_version: '0.11.1.1',
          action_on_failure: nil,
          dry_run: false,
        )

        # Uploads the script (unless +dry_run+) and serializes the step.
        # The result is memoized, so the upload happens at most once per
        # instance.
        #
        # @return [Hash] the +to_hash+ of the underlying RP::EMR::Step
        def to_hash
          @hash ||= begin
            upload_script! unless dry_run
            step.to_hash
          end
        end

        private

        def upload_script!
          # puts "Uploading to s3://#{script_bucket}/#{script_key}"
          s3.buckets[script_bucket].objects[script_key].write(script)
        end

        # Contents of the local script, read once. File.read closes the file
        # when done, unlike File.open(...).read, which left the handle open.
        def script
          @script ||= File.read(script_path)
        end

        # S3 key of the form scripts/emr_gem/<basename>_<md5 of contents>.pig
        def script_key
          @script_key ||= begin
            hash = Digest::MD5.hexdigest(script)
            "scripts/emr_gem/#{File.basename(script_path, '.pig')}_#{hash}.pig"
          end
        end

        def script_url
          "s3://#{script_bucket}/#{script_key}"
        end

        def step
          RP::EMR::Step.new(
            name: name,
            action_on_failure: action_on_failure,
            hadoop_jar_step: {
              jar: 's3://us-east-1.elasticmapreduce/libs/script-runner/script-runner.jar',
              args: hadoop_jar_base_args + args + formatted_params,
            }
          )
        end

        def hadoop_jar_base_args
          [
            's3://us-east-1.elasticmapreduce/libs/pig/pig-script',
            '--base-path', 's3://us-east-1.elasticmapreduce/libs/pig/',
            '--pig-versions', pig_version,
            '--run-pig-script',
          ]
        end

        # "--args -f <script url>" followed by one "-p key=value" pair for each
        # pig_params entry whose value is not nil.
        def formatted_params
          [
            '--args',
            '-f', script_url,
          ] + pig_params.
            reject { |_key, value| value.nil? }.
            flat_map { |key, value| ['-p', "#{key}=#{value}"] }
        end

        def s3
          AWS::S3.new
        end
      end
    end
  end
end
@@ -0,0 +1,93 @@
module RP
  module EMR
    class Step
      # Create a S3DistCp step
      # http://docs.aws.amazon.com/ElasticMapReduce/latest/DeveloperGuide/UsingEMR_s3distcp.html
      #
      # Turns the attributes below into the command-line arguments of the
      # S3DistCp jar.
      #
      # Accepts all the input parameters listed in the documentation as of version 1.0.7.
      #
      # @example
      #   step = RP::EMR::Step::S3DistCp.new do |s|
      #     s.src = 's3://bucket/input/prefix/'   # Note this is NOT Hadoop's glob syntax
      #     s.dest = 's3://bucket/output/path'
      #     s.srcPattern = 's3://bucket/input/prefix/(foo|bar).*\.eml' # Input regex - see Java's regex docs
      #     s.groupBy = '.*([a-z0-9]{2}).tsv'     # Note that you need a capture group
      #     s.targetSize = 120.megabytes          # NOTE(review): AWS docs give --targetSize in MiB; confirm the unit
      #     s.outputCodec = 'snappy'
      #     s.deleteOnSuccess = true
      #   end
      #
      #   step.to_hash # => Ruby hash ready for use in :steps key of a job
      #
      class S3DistCp
        extend Assembler

        DEFAULT_S3_DISTCP_JAR = '/home/hadoop/lib/emr-s3distcp-1.0.jar'

        # Options emitted as "--name value" when set.
        HASH_FIELDS = [
          :src,
          :dest,
          :groupBy,
          :targetSize,
          :outputCodec,
          :multipartUploadChunkSize,
          :startingIndex,
          :outputManifest,
          :previousManifest,
          :s3Endpoint,
          :srcPattern,
        ].freeze

        # Options emitted as a bare "--name" flag when truthy.
        BOOLEAN_FIELDS = [
          :s3ServerSideEncryption,
          :deleteOnSuccess,
          :disableMultipartUpload,
          :numberFiles,
          :copyFromManifest,
        ].freeze

        assemble_from(name: 'S3DistCp',
                      action_on_failure: nil,
                      s3_distcp_jar: DEFAULT_S3_DISTCP_JAR)
        assemble_from(Hash[HASH_FIELDS.map { |f| [f, nil] }])
        assemble_from(Hash[BOOLEAN_FIELDS.map { |f| [f, false] }])

        # @return [Hash] the +to_hash+ of the underlying RP::EMR::Step
        # @raise [ArgumentError] if a value option contains a space
        def to_hash
          step.to_hash
        end

        private

        def step
          RP::EMR::Step.new(
            name: name,
            action_on_failure: action_on_failure,
            hadoop_jar_step: {
              jar: s3_distcp_jar,
              args: hash_field_args + boolean_fields_args,
            }
          )
        end

        # Builds ["--field", "value", ...] for every HASH_FIELDS entry that is
        # not nil.
        #
        # The space check runs on the stringified value, i.e. on exactly what is
        # emitted. Matching the raw value with =~ breaks for non-String values
        # such as an Integer targetSize, because Object#=~ no longer exists in
        # Ruby 3.2+.
        def hash_field_args
          HASH_FIELDS.each_with_object([]) do |field, args|
            value = send(field)
            next if value.nil?

            text = value.to_s
            raise ArgumentError, "I don't know how to handle whitespace" if text.include?(' ')

            args << "--#{field}" << text
          end
        end

        # Builds ["--flag", ...] for every BOOLEAN_FIELDS entry that is truthy.
        def boolean_fields_args
          BOOLEAN_FIELDS.
            select { |field| send(field) }.
            map { |field| "--#{field}" }
        end
      end
    end
  end
end
@@ -0,0 +1,28 @@
module RP
  module EMR
    class Step
      # Step named "Setup Hadoop Debugging": runs the state-pusher fetch script
      # through the EMR script-runner jar.
      class SetupDebugging
        extend Assembler

        assemble_from action_on_failure: nil

        # @return [Hash] the +to_hash+ of the underlying RP::EMR::Step
        def to_hash
          step.to_hash
        end

        private

        def step
          jar_step = {
            jar: 's3://us-east-1.elasticmapreduce/libs/script-runner/script-runner.jar',
            args: ['s3://us-east-1.elasticmapreduce/libs/state-pusher/0.1/fetch'],
          }

          RP::EMR::Step.new(
            name: "Setup Hadoop Debugging",
            action_on_failure: action_on_failure,
            hadoop_jar_step: jar_step
          )
        end
      end
    end
  end
end
@@ -0,0 +1,36 @@
module RP
  module EMR
    class Step
      # Step named "Setup Hive": runs the hive-script installer through the EMR
      # script-runner jar, passing +hive_version+ as --hive-versions.
      class SetupHive
        extend Assembler

        assemble_from(
          hive_version: 'latest',
          action_on_failure: nil,
        )

        # @return [Hash] the +to_hash+ of the underlying RP::EMR::Step
        def to_hash
          step.to_hash
        end

        private

        def step
          installer_args = [
            's3://us-east-1.elasticmapreduce/libs/hive/hive-script',
            '--base-path', 's3://us-east-1.elasticmapreduce/libs/hive/',
            '--install-hive',
            '--hive-versions', hive_version,
          ]
          jar_step = {
            jar: 's3://us-east-1.elasticmapreduce/libs/script-runner/script-runner.jar',
            args: installer_args,
          }

          RP::EMR::Step.new(
            name: "Setup Hive",
            action_on_failure: action_on_failure,
            hadoop_jar_step: jar_step
          )
        end
      end
    end
  end
end
@@ -0,0 +1,36 @@
module RP
  module EMR
    class Step
      # Step named "Setup Pig": runs the pig-script installer through the EMR
      # script-runner jar, passing +pig_version+ as --pig-versions.
      class SetupPig
        extend Assembler

        assemble_from(
          pig_version: '0.11.1.1',
          action_on_failure: nil,
        )

        # @return [Hash] the +to_hash+ of the underlying RP::EMR::Step
        def to_hash
          step.to_hash
        end

        private

        def step
          installer_args = [
            's3://us-east-1.elasticmapreduce/libs/pig/pig-script',
            '--base-path', 's3://us-east-1.elasticmapreduce/libs/pig/',
            '--install-pig',
            '--pig-versions', pig_version,
          ]
          jar_step = {
            jar: 's3://us-east-1.elasticmapreduce/libs/script-runner/script-runner.jar',
            args: installer_args,
          }

          RP::EMR::Step.new(
            name: "Setup Pig",
            action_on_failure: action_on_failure,
            hadoop_jar_step: jar_step
          )
        end
      end
    end
  end
end
@@ -0,0 +1,21 @@
module RP
  module EMR
    # Generic job-flow step: a required name plus an optional failure action
    # and hadoop jar step definition.
    class Step
      extend Assembler

      assemble_from(
        :name,
        action_on_failure: nil,
        hadoop_jar_step: nil,
      )

      # Serializes the step, leaving out every key whose value is falsy or
      # empty (anything answering true to +empty?+).
      #
      # @return [Hash] attribute name => value
      def to_hash
        attributes = {
          name: name,
          action_on_failure: action_on_failure,
          hadoop_jar_step: hadoop_jar_step,
        }

        attributes.reject do |_key, value|
          next true unless value

          value.respond_to?(:empty?) && value.empty?
        end
      end
    end
  end
end
@@ -0,0 +1,5 @@
module RP
  module EMR
    # Gem version; the gemspec reads it as the spec version.
    VERSION = '1.0.3'
  end
end
data/lib/rp/emr.rb ADDED
@@ -0,0 +1,26 @@
1
+ require 'active_support'
2
+ require 'active_support/core_ext/numeric'
3
+ require 'assembler'
4
+ require 'aws-sdk'
5
+ require 'ostruct'
6
+ require 'pp'
7
+ require 'thor'
8
+
9
+
10
+ module RP
11
+ module EMR
12
+ end
13
+ end
14
+
15
+ require_relative 'emr/cli'
16
+ require_relative 'emr/instance_group'
17
+ require_relative 'emr/instances'
18
+ require_relative 'emr/step'
19
+ require_relative 'emr/job'
20
+ require_relative 'emr/instance_groups'
21
+ require_relative 'emr/bootstrap_action'
22
+ require_relative 'emr/step/pig'
23
+ require_relative 'emr/step/s3_dist_cp'
24
+ require_relative 'emr/step/setup_debugging'
25
+ require_relative 'emr/step/setup_pig'
26
+ require_relative 'emr/step/setup_hive'
data/rp-emr.gemspec ADDED
@@ -0,0 +1,31 @@
# coding: utf-8

# Put the gem's lib directory on the load path so the version file can be
# required before the gem is installed.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'rp/emr/version'

Gem::Specification.new do |spec|
  spec.name          = "rp-emr"
  spec.version       = RP::EMR::VERSION
  spec.authors       = ["Ryan Michael", "Andrew Harrison"]
  spec.email         = ["ryanmichael@otherinbox.com", 'andrew.harrison@returnpath.com']
  spec.summary       = %q{EMR Helpers}
  spec.description   = %q{Framework for launching EMR job flows}
  spec.homepage      = ""
  spec.license       = "MIT"

  # Package every file tracked by git; executables are the basenames of the
  # files under bin/.
  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  ["activesupport", "aws-sdk", "assembler", "thor"].each do |gem_name|
    spec.add_dependency gem_name
  end

  ["bundler", "rake", "rspec", "pry", "fuubar"].each do |gem_name|
    spec.add_development_dependency gem_name
  end
end
@@ -0,0 +1,23 @@
require 'spec_helper'

# Checks how a bootstrap action built with the block form serializes.
describe RP::EMR::BootstrapAction do
  describe "#to_hash" do
    let(:action) do
      RP::EMR::BootstrapAction.new do |bootstrap|
        bootstrap.name = 'name'
        bootstrap.path = 'path'
        bootstrap.args = ['args']
      end
    end

    let(:expected_hash) do
      {
        name: 'name',
        script_bootstrap_action: {
          path: 'path',
          args: ['args'],
        },
      }
    end

    it "returns hash" do
      expect(action.to_hash).to eq(expected_hash)
    end
  end
end
@@ -0,0 +1,51 @@
require 'spec_helper'

# Checks how a single instance group serializes. Each context overrides
# group_args to add one optional attribute to the constructor.
describe RP::EMR::InstanceGroup do
  describe "#to_hash" do
    let(:group_args) { {} }

    let(:group) do
      RP::EMR::InstanceGroup.new(group_args) do |ig|
        ig.instance_role = 'instance_role'
        ig.instance_type = 'instance_type'
        ig.instance_count = 'instance_count'
      end
    end

    it "returns a hash" do
      expect(group.to_hash).to eq(
        :instance_role=>"instance_role",
        :instance_type=>"instance_type",
        :instance_count=>"instance_count",
      )
    end

    context "with name" do
      let(:group_args) { {name: 'name'} }

      it "sets name" do
        expect(group.to_hash[:name]).to eq('name')
      end
    end

    context "with market" do
      let(:group_args) { {market: 'market'} }

      # Was labelled "sets name" (copy-paste from the context above); the
      # example asserts :market.
      it "sets market" do
        expect(group.to_hash[:market]).to eq('market')
      end
    end

    context "with bid_price" do
      let(:group_args) { {bid_price: 1.0} }

      it "sets market" do
        expect(group.to_hash[:market]).to eq('SPOT')
      end

      it "sets bid_price" do
        expect(group.to_hash[:bid_price]).to eq('1.0')
      end
    end
  end
end
@@ -0,0 +1,106 @@
require 'spec_helper'

# Checks the array produced for the MASTER, CORE and TASK groups. Each context
# overrides group_args with the per-role options under test.
describe RP::EMR::InstanceGroups do
  describe "#to_a" do
    let(:group_args) { {} }

    let(:groups) do
      RP::EMR::InstanceGroups.new(group_args)
    end

    # Collects one key from every serialized group, in output order.
    def values_of(key)
      groups.to_a.map { |group| group[key] }
    end

    it "returns hash" do
      expected = ['MASTER', 'CORE', 'TASK'].map do |role|
        {
          market: "ON_DEMAND",
          instance_role: role,
          instance_type: "t1.micro",
          instance_count: 1,
        }
      end

      expect(groups.to_a).to eq(expected)
    end

    context "with default_instance_type" do
      let(:group_args) { {default_instance_type: 'default_instance_type'} }

      it "sets instance type" do
        expect(values_of(:instance_type).uniq).to eq(['default_instance_type'])
      end
    end

    context "with instance_type" do
      let(:group_args) do
        {
          master_instance_type: 'master_instance_type',
          core_instance_type: 'core_instance_type',
          task_instance_type: 'task_instance_type',
        }
      end

      it "sets instance type" do
        expect(values_of(:instance_type)).to eq([
          'master_instance_type',
          'core_instance_type',
          'task_instance_type',
        ])
      end
    end

    context "with instance_count" do
      let(:group_args) do
        {
          master_instance_count: 1,
          core_instance_count: 2,
          task_instance_count: 3,
        }
      end

      it "sets instance count" do
        expect(values_of(:instance_count)).to eq([1, 2, 3])
      end
    end

    context "with bid_price" do
      let(:group_args) do
        {
          master_bid_price: 1,
          core_bid_price: 2,
          task_bid_price: 3,
        }
      end

      it "sets bid price" do
        expect(values_of(:bid_price)).to eq(['1', '2', '3'])
      end

      it "sets market" do
        expect(values_of(:market).uniq).to eq(['SPOT'])
      end
    end

    context "with market" do
      let(:group_args) do
        {
          master_market: 'master_market',
          core_market: 'core_market',
          task_market: 'task_market',
        }
      end

      it "sets market" do
        expect(values_of(:market)).to eq(['master_market', 'core_market', 'task_market'])
      end
    end
  end
end