rp-emr 1.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,66 @@
1
module RP
  module EMR
    # Builds the MASTER, CORE and TASK instance group definitions of a job flow.
    #
    # Every role has its own instance type, count, market and bid price
    # setting. A role without an explicit instance type falls back to
    # +default_instance_type+.
    class InstanceGroups
      extend Assembler

      assemble_from(
        default_instance_type: 't1.micro',

        master_instance_type: nil,
        master_instance_count: 1,
        master_market: 'ON_DEMAND',
        master_bid_price: nil,

        core_instance_type: nil,
        core_instance_count: 1,
        core_market: 'ON_DEMAND',
        core_bid_price: nil,

        task_instance_type: nil,
        task_instance_count: 1,
        task_market: 'ON_DEMAND',
        task_bid_price: nil,
      )

      # Serializes the three groups, in MASTER, CORE, TASK order.
      #
      # @return [Array] the result of +to_hash+ for each group, without the
      #   groups whose +:instance_count+ equals 0
      def to_a
        groups = [master_instance_group, core_instance_group, task_instance_group]
        groups.map { |group| group.to_hash }.reject { |group_hash| group_hash[:instance_count] == 0 }
      end

      private

      def master_instance_group
        build_instance_group('MASTER', master_instance_type, master_instance_count, master_market, master_bid_price)
      end

      def core_instance_group
        build_instance_group('CORE', core_instance_type, core_instance_count, core_market, core_bid_price)
      end

      def task_instance_group
        build_instance_group('TASK', task_instance_type, task_instance_count, task_market, task_bid_price)
      end

      # Shared builder behind the three role-specific methods above.
      # A nil/false +type+ is replaced by +default_instance_type+.
      def build_instance_group(role, type, count, market, bid_price)
        RP::EMR::InstanceGroup.new do |group|
          group.instance_role = role
          group.instance_type = type || default_instance_type
          group.instance_count = count
          group.market = market
          group.bid_price = bid_price
        end
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
module RP
  module EMR
    # Value object holding the instance-level settings of a job flow.
    # Every attribute is optional and defaults to nil.
    class Instances
      extend Assembler

      assemble_from(
        # Optional params
        master_instance_type: nil,
        slave_instance_type: nil,
        instance_count: nil,
        instance_groups: nil,
        ec2_key_name: nil,
        placement: nil,
        keep_job_flow_alive_when_no_steps: nil,
        termination_protected: nil,
        hadoop_version: nil,
        ec2_subnet_id: nil,
      )

      # Collects the attributes into a hash, keeping only the entries that
      # carry a value.
      #
      # NOTE: +false+ counts as "no value" here, so an explicit +false+ flag is
      # omitted just like +nil+; so is anything that responds to +empty?+ with
      # true.
      #
      # @return [Hash] attribute name => value
      def to_hash
        attributes = {
          master_instance_type: master_instance_type,
          slave_instance_type: slave_instance_type,
          instance_count: instance_count,
          instance_groups: instance_groups,
          ec2_key_name: ec2_key_name,
          placement: placement,
          keep_job_flow_alive_when_no_steps: keep_job_flow_alive_when_no_steps,
          termination_protected: termination_protected,
          hadoop_version: hadoop_version,
          ec2_subnet_id: ec2_subnet_id,
        }

        attributes.select do |_key, value|
          value && !(value.respond_to?(:empty?) && value.empty?)
        end
      end
    end
  end
end
data/lib/rp/emr/job.rb ADDED
@@ -0,0 +1,42 @@
1
module RP
  module EMR
    # Value object describing a whole job flow: its instances plus optional
    # steps, bootstrap actions, roles, tags and logging settings.
    class Job
      extend Assembler

      assemble_from(
        # Required params
        :instances,

        # Optional params
        steps: nil,
        log_uri: nil,
        additional_info: nil,
        ami_version: :latest,
        bootstrap_actions: nil,
        supported_products: nil,
        new_supported_products: nil,
        visible_to_all_users: true,
        job_flow_role: nil,
        service_role: nil,
        tags: nil,
      )

      # Collects the attributes into a hash, keeping only the entries that
      # carry a value.
      #
      # +ami_version+ is always converted with +to_s+ (so the default
      # +:latest+ becomes "latest"; a nil version becomes "" and is dropped).
      #
      # NOTE: +false+ counts as "no value" here, so an explicit +false+ for
      # +visible_to_all_users+ is omitted rather than emitted.
      #
      # @return [Hash] attribute name => value
      def to_hash
        attributes = {
          instances: instances,
          log_uri: log_uri,
          additional_info: additional_info,
          ami_version: ami_version.to_s,
          steps: steps,
          bootstrap_actions: bootstrap_actions,
          supported_products: supported_products,
          new_supported_products: new_supported_products,
          visible_to_all_users: visible_to_all_users,
          job_flow_role: job_flow_role,
          service_role: service_role,
          tags: tags,
        }

        attributes.select do |_key, value|
          value && !(value.respond_to?(:empty?) && value.empty?)
        end
      end
    end
  end
end
@@ -0,0 +1,84 @@
1
+ require 'digest/md5'
2
+
3
module RP
  module EMR
    class Step
      # Step that runs a local Pig script through the EMR script runner.
      #
      # Unless +dry_run+ is set, serializing the step uploads the script to
      # +script_bucket+ under a key that embeds the MD5 digest of its content,
      # and the step is pointed at that S3 location.
      class Pig
        extend Assembler

        assemble_from(
          :name,
          :script_path,
          :script_bucket,
          args: [],
          pig_params: {},
          pig_version: '0.11.1.1',
          action_on_failure: nil,
          dry_run: false,
        )

        # Uploads the script (unless +dry_run+) and serializes the step.
        # The result is memoized, so the upload happens at most once per
        # instance.
        #
        # @return [Hash] whatever RP::EMR::Step#to_hash returns for this step
        def to_hash
          @hash ||= begin
            upload_script! unless dry_run
            step.to_hash
          end
        end

        private

        # Writes the script content to s3://<script_bucket>/<script_key>.
        def upload_script!
          s3.buckets[script_bucket].objects[script_key].write(script)
        end

        # Content of the local script, read once and cached.
        # File.read closes the file handle itself; the previous
        # File.open(...).read left it open until garbage collection.
        def script
          @script ||= File.read(script_path)
        end

        # S3 key of the uploaded script: the base name of +script_path+
        # (without a ".pig" extension) followed by the content's MD5 digest.
        def script_key
          @script_key ||= begin
            digest = Digest::MD5.hexdigest(script)
            "scripts/emr_gem/#{File.basename(script_path, '.pig')}_#{digest}.pig"
          end
        end

        def script_url
          "s3://#{script_bucket}/#{script_key}"
        end

        def step
          RP::EMR::Step.new(
            name: name,
            action_on_failure: action_on_failure,
            hadoop_jar_step: {
              jar: 's3://us-east-1.elasticmapreduce/libs/script-runner/script-runner.jar',
              args: hadoop_jar_base_args + args + formatted_params,
            }
          )
        end

        def hadoop_jar_base_args
          [
            's3://us-east-1.elasticmapreduce/libs/pig/pig-script',
            '--base-path', 's3://us-east-1.elasticmapreduce/libs/pig/',
            '--pig-versions', pig_version,
            '--run-pig-script',
          ]
        end

        # Trailing arguments: the script location followed by one
        # "-p key=value" pair per +pig_params+ entry whose value is not nil.
        def formatted_params
          script_args = ['--args', '-f', script_url]
          param_args = pig_params.
            reject { |_key, value| value.nil? }.
            flat_map { |key, value| ['-p', "#{key}=#{value}"] }

          script_args + param_args
        end

        def s3
          AWS::S3.new
        end
      end
    end
  end
end
@@ -0,0 +1,93 @@
1
module RP
  module EMR
    class Step
      # Create a S3DistCp step
      # http://docs.aws.amazon.com/ElasticMapReduce/latest/DeveloperGuide/UsingEMR_s3distcp.html
      #
      # Turns the configured fields into the command-line arguments of the
      # S3DistCp jar and rejects values containing a space, which cannot be
      # passed through safely.
      #
      # Accepts all the input parameters listed in the documentation as of version 1.0.7.
      #
      # @example
      #   step = RP::EMR::Step::S3DistCp.new do |s|
      #     s.src = 's3://bucket/input/prefix/' # Note this is NOT Hadoop's glob syntax
      #     s.dest = 's3://bucket/output/path'
      #     s.srcPattern = 's3://bucket/input/prefix/[foo|bar].*\.eml' # Input regex - see Java's regex docs
      #     s.groupBy = '.*([a-z0-9]{2}).tsv' # Note that you need a capture group
      #     s.targetSize = 120.megabytes
      #     s.outputCodec = 'snappy'
      #     s.deleteOnSuccess = true
      #   end
      #
      #   step.to_hash # => Ruby hash ready for use in :steps key of a job
      #
      class S3DistCp
        extend Assembler

        DEFAULT_S3_DISTCP_JAR = '/home/hadoop/lib/emr-s3distcp-1.0.jar'.freeze

        # Fields emitted as "--field value" when they are not nil.
        HASH_FIELDS = [
          :src,
          :dest,
          :groupBy,
          :targetSize,
          :outputCodec,
          :multipartUploadChunkSize,
          :startingIndex,
          :outputManifest,
          :previousManifest,
          :s3Endpoint,
          :srcPattern,
        ].freeze

        # Fields emitted as a bare "--field" flag when they are truthy.
        BOOLEAN_FIELDS = [
          :s3ServerSideEncryption,
          :deleteOnSuccess,
          :disableMultipartUpload,
          :numberFiles,
          :copyFromManifest,
        ].freeze

        assemble_from(name: 'S3DistCp',
                      action_on_failure: nil,
                      s3_distcp_jar: DEFAULT_S3_DISTCP_JAR)
        assemble_from(Hash[HASH_FIELDS.map { |f| [f, nil] }])
        assemble_from(Hash[BOOLEAN_FIELDS.map { |f| [f, false] }])

        # @return [Hash] whatever RP::EMR::Step#to_hash returns for this step
        # @raise [ArgumentError] if the string form of any value field
        #   contains a space
        def to_hash
          step.to_hash
        end

        private

        def step
          RP::EMR::Step.new(
            name: name,
            action_on_failure: action_on_failure,
            hadoop_jar_step: {
              jar: s3_distcp_jar,
              args: hash_field_args + boolean_fields_args,
            }
          )
        end

        # Builds the "--field value" pairs for every non-nil value field.
        #
        # The space check runs on the string form of the value. Matching the
        # raw value with =~ breaks for non-String values such as an Integer
        # targetSize on Rubies where Object#=~ no longer exists.
        def hash_field_args
          HASH_FIELDS.flat_map do |field|
            value = send(field)
            next [] if value.nil?

            text = value.to_s
            raise ArgumentError, "I don't know how to handle whitespace" if text.include?(' ')

            ["--#{field}", text]
          end
        end

        # Builds the bare "--field" flags for every truthy boolean field.
        def boolean_fields_args
          BOOLEAN_FIELDS.
            select { |field| send(field) }.
            map { |field| "--#{field}" }
        end
      end
    end
  end
end
@@ -0,0 +1,28 @@
1
module RP
  module EMR
    class Step
      # Step named "Setup Hadoop Debugging": runs the state-pusher fetch
      # script through the EMR script runner.
      class SetupDebugging
        extend Assembler

        assemble_from action_on_failure: nil

        # @return [Hash] whatever RP::EMR::Step#to_hash returns for this step
        def to_hash
          step.to_hash
        end

        private

        def step
          jar_step = {
            jar: 's3://us-east-1.elasticmapreduce/libs/script-runner/script-runner.jar',
            args: ['s3://us-east-1.elasticmapreduce/libs/state-pusher/0.1/fetch'],
          }

          RP::EMR::Step.new(
            name: "Setup Hadoop Debugging",
            action_on_failure: action_on_failure,
            hadoop_jar_step: jar_step
          )
        end
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
module RP
  module EMR
    class Step
      # Step named "Setup Hive": runs the hive-script installer through the
      # EMR script runner with the configured +hive_version+.
      class SetupHive
        extend Assembler

        assemble_from(
          hive_version: 'latest',
          action_on_failure: nil,
        )

        # @return [Hash] whatever RP::EMR::Step#to_hash returns for this step
        def to_hash
          step.to_hash
        end

        private

        def step
          installer_args = [
            's3://us-east-1.elasticmapreduce/libs/hive/hive-script',
            '--base-path', 's3://us-east-1.elasticmapreduce/libs/hive/',
            '--install-hive',
            '--hive-versions', hive_version,
          ]
          jar_step = {
            jar: 's3://us-east-1.elasticmapreduce/libs/script-runner/script-runner.jar',
            args: installer_args,
          }

          RP::EMR::Step.new(
            name: "Setup Hive",
            action_on_failure: action_on_failure,
            hadoop_jar_step: jar_step
          )
        end
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
module RP
  module EMR
    class Step
      # Step named "Setup Pig": runs the pig-script installer through the
      # EMR script runner with the configured +pig_version+.
      class SetupPig
        extend Assembler

        assemble_from(
          pig_version: '0.11.1.1',
          action_on_failure: nil,
        )

        # @return [Hash] whatever RP::EMR::Step#to_hash returns for this step
        def to_hash
          step.to_hash
        end

        private

        def step
          installer_args = [
            's3://us-east-1.elasticmapreduce/libs/pig/pig-script',
            '--base-path', 's3://us-east-1.elasticmapreduce/libs/pig/',
            '--install-pig',
            '--pig-versions', pig_version,
          ]
          jar_step = {
            jar: 's3://us-east-1.elasticmapreduce/libs/script-runner/script-runner.jar',
            args: installer_args,
          }

          RP::EMR::Step.new(
            name: "Setup Pig",
            action_on_failure: action_on_failure,
            hadoop_jar_step: jar_step
          )
        end
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
module RP
  module EMR
    # Generic job flow step: a required name plus an optional failure action
    # and Hadoop jar step definition.
    class Step
      extend Assembler

      assemble_from(
        :name,
        action_on_failure: nil,
        hadoop_jar_step: nil,
      )

      # Collects the attributes into a hash, keeping only the entries that
      # carry a value (nil, false and anything answering +empty?+ with true
      # are left out).
      #
      # @return [Hash] attribute name => value
      def to_hash
        attributes = {
          name: name,
          action_on_failure: action_on_failure,
          hadoop_jar_step: hadoop_jar_step,
        }

        attributes.select do |_key, value|
          value && !(value.respond_to?(:empty?) && value.empty?)
        end
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
module RP
  module EMR
    # Gem version; read by the gemspec. Frozen so that no caller can mutate
    # the shared constant in place.
    VERSION = "1.0.3".freeze
  end
end
data/lib/rp/emr.rb ADDED
@@ -0,0 +1,26 @@
1
+ require 'active_support'
2
+ require 'active_support/core_ext/numeric'
3
+ require 'assembler'
4
+ require 'aws-sdk'
5
+ require 'ostruct'
6
+ require 'pp'
7
+ require 'thor'
8
+
9
+
10
# Declares the RP::EMR namespace up front so that the files required below
# can reopen it.
module RP
  module EMR; end
end
14
+
15
+ require_relative 'emr/cli'
16
+ require_relative 'emr/instance_group'
17
+ require_relative 'emr/instances'
18
+ require_relative 'emr/step'
19
+ require_relative 'emr/job'
20
+ require_relative 'emr/instance_groups'
21
+ require_relative 'emr/bootstrap_action'
22
+ require_relative 'emr/step/pig'
23
+ require_relative 'emr/step/s3_dist_cp'
24
+ require_relative 'emr/step/setup_debugging'
25
+ require_relative 'emr/step/setup_pig'
26
+ require_relative 'emr/step/setup_hive'
data/rp-emr.gemspec ADDED
@@ -0,0 +1,31 @@
1
+ # coding: utf-8
2
# Put lib/ on the load path so the version file can be required below.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'rp/emr/version'

Gem::Specification.new do |spec|
  spec.name          = "rp-emr"
  spec.version       = RP::EMR::VERSION
  spec.authors       = ["Ryan Michael", "Andrew Harrison"]
  spec.email         = ["ryanmichael@otherinbox.com", "andrew.harrison@returnpath.com"]
  spec.summary       = %q{EMR Helpers}
  spec.description   = %q{Framework for launching EMR job flows}
  spec.homepage      = ""
  spec.license       = "MIT"

  # NUL-separated listing: independent of the global record separator ($/)
  # and safe for file names that contain newlines.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_dependency "activesupport"
  spec.add_dependency "aws-sdk"
  spec.add_dependency "assembler"
  spec.add_dependency "thor"

  spec.add_development_dependency "bundler"
  spec.add_development_dependency "rake"
  spec.add_development_dependency "rspec"
  spec.add_development_dependency "pry"
  spec.add_development_dependency "fuubar"
end
@@ -0,0 +1,23 @@
1
+ require 'spec_helper'
2
+
3
# Checks the hash layout produced by RP::EMR::BootstrapAction#to_hash.
describe RP::EMR::BootstrapAction do
  describe "#to_hash" do
    let(:action) do
      RP::EMR::BootstrapAction.new do |bootstrap|
        bootstrap.name = 'name'
        bootstrap.path = 'path'
        bootstrap.args = ['args']
      end
    end

    # Path and args are expected nested under :script_bootstrap_action.
    let(:expected_hash) do
      {
        name: 'name',
        script_bootstrap_action: { path: 'path', args: ['args'] },
      }
    end

    it "returns hash" do
      expect(action.to_hash).to eq(expected_hash)
    end
  end
end
@@ -0,0 +1,51 @@
1
+ require 'spec_helper'
2
+
3
# Checks RP::EMR::InstanceGroup#to_hash for the required attributes and for
# each optional attribute passed through the constructor hash.
describe RP::EMR::InstanceGroup do
  describe "#to_hash" do
    # Overridden per context to supply optional constructor arguments.
    let(:group_args) { {} }

    let(:group) do
      RP::EMR::InstanceGroup.new(group_args) do |ig|
        ig.instance_role = 'instance_role'
        ig.instance_type = 'instance_type'
        ig.instance_count = 'instance_count'
      end
    end

    it "returns a hash" do
      expect(group.to_hash).to eq(
        :instance_role=>"instance_role",
        :instance_type=>"instance_type",
        :instance_count=>"instance_count",
      )
    end

    context "with name" do
      let(:group_args) { {name: 'name'} }

      it "sets name" do
        expect(group.to_hash[:name]).to eq('name')
      end
    end

    context "with market" do
      let(:group_args) { {market: 'market'} }

      # Previously labelled "sets name", although it asserts on :market.
      it "sets market" do
        expect(group.to_hash[:market]).to eq('market')
      end
    end

    context "with bid_price" do
      let(:group_args) { {bid_price: 1.0} }

      it "sets market" do
        expect(group.to_hash[:market]).to eq('SPOT')
      end

      it "sets bid_price" do
        expect(group.to_hash[:bid_price]).to eq('1.0')
      end
    end
  end
end
@@ -0,0 +1,106 @@
1
+ require 'spec_helper'
2
+
3
# Checks RP::EMR::InstanceGroups#to_a: the defaults, every per-role override
# and the omission of groups with a zero instance count.
describe RP::EMR::InstanceGroups do
  describe "#to_a" do
    # Overridden per context to supply constructor arguments.
    let(:group_args) { {} }

    let(:groups) do
      RP::EMR::InstanceGroups.new(group_args)
    end

    it "returns an array of hashes" do
      expect(groups.to_a).to eq([
        {
          :market=>"ON_DEMAND",
          :instance_role=>"MASTER",
          :instance_type=>"t1.micro",
          :instance_count=>1,
        },
        {
          :market=>"ON_DEMAND",
          :instance_role=>"CORE",
          :instance_type=>"t1.micro",
          :instance_count=>1,
        },
        {
          :market=>"ON_DEMAND",
          :instance_role=>"TASK",
          :instance_type=>"t1.micro",
          :instance_count=>1,
        },
      ])
    end

    context "with default_instance_type" do
      let(:group_args) { {default_instance_type: 'default_instance_type'} }

      it "sets instance type" do
        expect(groups.to_a.map { |h| h[:instance_type] }.uniq).to eq(['default_instance_type'])
      end
    end

    context "with instance_type" do
      let(:group_args) do
        {
          master_instance_type: 'master_instance_type',
          core_instance_type: 'core_instance_type',
          task_instance_type: 'task_instance_type',
        }
      end

      it "sets instance type" do
        expect(groups.to_a.map { |h| h[:instance_type] }).to eq([
          'master_instance_type',
          'core_instance_type',
          'task_instance_type',
        ])
      end
    end

    context "with instance_count" do
      let(:group_args) do
        {
          master_instance_count: 1,
          core_instance_count: 2,
          task_instance_count: 3,
        }
      end

      it "sets instance count" do
        expect(groups.to_a.map { |h| h[:instance_count] }).to eq([1, 2, 3])
      end
    end

    # Covers the filter in #to_a that drops groups with an instance count of 0.
    context "with a zero instance_count" do
      let(:group_args) { {task_instance_count: 0} }

      it "omits the empty group" do
        expect(groups.to_a.map { |h| h[:instance_role] }).to eq(['MASTER', 'CORE'])
      end
    end

    context "with bid_price" do
      let(:group_args) do
        {
          master_bid_price: 1,
          core_bid_price: 2,
          task_bid_price: 3,
        }
      end

      it "sets bid price" do
        expect(groups.to_a.map { |h| h[:bid_price] }).to eq(['1', '2', '3'])
      end

      it "sets market" do
        expect(groups.to_a.map { |h| h[:market] }.uniq).to eq(['SPOT'])
      end
    end

    context "with market" do
      let(:group_args) do
        {
          master_market: 'master_market',
          core_market: 'core_market',
          task_market: 'task_market',
        }
      end

      it "sets market" do
        expect(groups.to_a.map { |h| h[:market] }).to eq(['master_market', 'core_market', 'task_market'])
      end
    end
  end
end
+ end