rubadoop 0.7.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Gemfile +7 -0
- data/Gemfile.lock +43 -0
- data/README.rdoc +7 -0
- data/Rakefile +25 -0
- data/lib/rubadoop.rb +26 -0
- data/lib/rubadoop/base_dsl.rb +31 -0
- data/lib/rubadoop/emr.rb +23 -0
- data/lib/rubadoop/emr/jobflow_builder.rb +36 -0
- data/lib/rubadoop/emr/jobflow_builder/bootstrap_action.rb +27 -0
- data/lib/rubadoop/emr/jobflow_builder/job_spec.rb +77 -0
- data/lib/rubadoop/emr/jobflow_builder/step.rb +41 -0
- data/lib/rubadoop/map_reduce.rb +23 -0
- data/lib/rubadoop/map_reduce/call_java.rb +112 -0
- data/lib/rubadoop/map_reduce/call_streaming.rb +55 -0
- data/lib/rubadoop/map_reduce/identity.rb +30 -0
- data/lib/rubadoop/map_reduce/io.rb +128 -0
- data/lib/rubadoop/map_reduce/job_conf_environment.rb +9 -0
- data/lib/rubadoop/map_reduce/mappable.rb +59 -0
- data/lib/rubadoop/map_reduce/mapper.rb +15 -0
- data/lib/rubadoop/map_reduce/reducable.rb +74 -0
- data/lib/rubadoop/map_reduce/reducer.rb +12 -0
- data/lib/rubadoop/map_reduce/test_assist.rb +65 -0
- data/lib/rubadoop/map_reduce/utils.rb +29 -0
- data/lib/rubadoop/oozie/workflow_builder.rb +42 -0
- data/lib/rubadoop/oozie/workflow_builder/job_properties.rb +19 -0
- data/lib/rubadoop/version.rb +3 -0
- data/test/rubadoop/base_dsl_test.rb +27 -0
- data/test/rubadoop/emr/jobflow_builder_test.rb +184 -0
- data/test/rubadoop/map_reduce/call_java_test.rb +122 -0
- data/test/rubadoop/map_reduce/call_streaming_test.rb +81 -0
- data/test/rubadoop/map_reduce/identity_test.rb +40 -0
- data/test/rubadoop/map_reduce/io_test.rb +51 -0
- data/test/rubadoop/map_reduce/job_conf_environment_test.rb +28 -0
- data/test/rubadoop/map_reduce/mappable_test.rb +62 -0
- data/test/rubadoop/map_reduce/mapper_test.rb +76 -0
- data/test/rubadoop/map_reduce/reducable_test.rb +12 -0
- data/test/rubadoop/map_reduce/reducer_test.rb +137 -0
- data/test/rubadoop/map_reduce/test_assist_test.rb +76 -0
- data/test/rubadoop/oozie/workflow_builder_test.rb +21 -0
- data/test/test_helper.rb +10 -0
- metadata +140 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: e877fa9607f54419f1a7cd9d2033439465bc7fd1
|
4
|
+
data.tar.gz: 99cd20b45eb293eae96a2836ae0e207b83d38ff3
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 22c74443aabfe4b73edba04d89ccd4ce2ddd9eaaa8c3f7db389165a359f566a4d83f5608d051ed1af92700b5ce9b45177a18678155785f8677ee5d13320659ac
|
7
|
+
data.tar.gz: 570d9085f2dc997b908d09d9a12ec3d5041f85e6ae95cb260f3753de6ecab3bb23c2293bcfdd111ad7bbcd5551c9a548746891a00bd1093cf4279ca5307abd54
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
rubadoop (0.7.7)
|
5
|
+
activesupport (>= 3)
|
6
|
+
aws-sdk (>= 1.5.8)
|
7
|
+
|
8
|
+
GEM
|
9
|
+
remote: http://rubygems.org/
|
10
|
+
specs:
|
11
|
+
activesupport (4.0.0)
|
12
|
+
i18n (~> 0.6, >= 0.6.4)
|
13
|
+
minitest (~> 4.2)
|
14
|
+
multi_json (~> 1.3)
|
15
|
+
thread_safe (~> 0.1)
|
16
|
+
tzinfo (~> 0.3.37)
|
17
|
+
atomic (1.1.14)
|
18
|
+
aws-sdk (1.24.0)
|
19
|
+
json (~> 1.4)
|
20
|
+
nokogiri (>= 1.4.4, < 1.6.0)
|
21
|
+
uuidtools (~> 2.1)
|
22
|
+
i18n (0.6.5)
|
23
|
+
json (1.8.1)
|
24
|
+
minitest (4.7.5)
|
25
|
+
multi_json (1.8.2)
|
26
|
+
nokogiri (1.5.10)
|
27
|
+
rdoc (4.0.0)
|
28
|
+
simplecov (0.7.1)
|
29
|
+
multi_json (~> 1.0)
|
30
|
+
simplecov-html (~> 0.7.1)
|
31
|
+
simplecov-html (0.7.1)
|
32
|
+
thread_safe (0.1.3)
|
33
|
+
atomic
|
34
|
+
tzinfo (0.3.38)
|
35
|
+
uuidtools (2.1.4)
|
36
|
+
|
37
|
+
PLATFORMS
|
38
|
+
ruby
|
39
|
+
|
40
|
+
DEPENDENCIES
|
41
|
+
rdoc
|
42
|
+
rubadoop!
|
43
|
+
simplecov
|
data/README.rdoc
ADDED
data/Rakefile
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'rake'
|
2
|
+
require 'rake/testtask'
|
3
|
+
require 'rdoc/task'
|
4
|
+
|
5
|
+
# Running plain `rake` executes the test suite.
task default: :test

desc 'Build the gem from rubadoop.gemspec.'
task :build do
  # Use rake's `sh` instead of Kernel#exec: exec replaces the rake process,
  # so any task scheduled after :build would never run. `sh` runs the command
  # as a child process and fails the task when the command exits non-zero.
  sh 'gem build rubadoop.gemspec'
end

# Runs every test/**/*_test.rb file with lib/ and test/ on the load path.
Rake::TestTask.new(:test) do |t|
  t.libs << 'lib'
  t.libs << 'test'
  t.pattern = 'test/**/*_test.rb'
  t.verbose = true
end

desc 'Generate documentation.'
RDoc::Task.new(:rdoc) do |rdoc|
  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = 'Rubadoop'
  rdoc.options << '--line-numbers' << '--inline-source'
  rdoc.rdoc_files.include('README.rdoc')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/lib/rubadoop.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'active_support/dependencies/autoload'
|
2
|
+
require 'active_support/concern'
|
3
|
+
require 'active_support/core_ext/class/attribute'
|
4
|
+
require 'active_support/inflector'
|
5
|
+
|
6
|
+
# Root namespace of the gem. Each constant below is registered through
# ActiveSupport::Autoload rather than being required eagerly.
module Rubadoop
  extend ActiveSupport::Autoload

  autoload :BaseDsl
  autoload :MapReduce

  # Namespace for the EMR job-flow builder DSL.
  # NOTE(review): only JobflowBuilder is registered here; nothing visible in
  # this file loads rubadoop/emr.rb (the JobFlowStates / InstanceRoles /
  # StepActionOnFailure tables) -- confirm callers require it themselves.
  module Emr
    extend ActiveSupport::Autoload

    autoload :JobflowBuilder
  end

  # Namespace for the Oozie workflow builder DSL.
  module Oozie
    extend ActiveSupport::Autoload

    autoload :WorkflowBuilder
  end
end
|
25
|
+
|
26
|
+
require 'rubadoop/version'
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'active_support/core_ext/hash/indifferent_access'
|
2
|
+
|
3
|
+
module Rubadoop
  # Common base for the DSL builders. It owns a "strict" params hash: reading
  # a key whose value is nil (including a missing key) raises instead of
  # silently returning nil.
  class BaseDsl
    attr_reader :params

    # params - a Hash-like object, or nil/false for "no params".
    #
    # An object that already carries a singleton `[]` method is stored as-is,
    # so a parent builder can hand its own params object to child builders.
    # Anything else is duplicated, converted with `with_indifferent_access`,
    # and given the strict `[]` reader.
    def initialize(params)
      already_strict = params.singleton_methods.include?(:[])

      unless already_strict
        params = (params ? params.dup : {}).with_indifferent_access

        # Strict lookup: delegate to the regular reader, then refuse nil.
        def params.[](param)
          super(param).tap do |found|
            raise "Missing param: #{param}" if found.nil?
          end
        end
      end

      @params = params
    end

    # True when +param+ is present as a key (its value is not inspected).
    def has_param?(param)
      params.key?(param)
    end

    # Stores +default_value+ under +param+ unless the key is already present.
    # Returns the stored default, or nil when nothing was changed.
    def optional_param(param, default_value)
      return if has_param?(param)

      params[param] = default_value
    end
  end
end
|
31
|
+
|
data/lib/rubadoop/emr.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
module Rubadoop
  module Emr
    # Lookup tables from Ruby-style symbols to the upper-case strings used for
    # EMR job-flow states, instance roles and step failure actions.
    # The tables are frozen so shared constants cannot be modified by accident.

    # Job flow lifecycle states.
    JobFlowStates = {
      starting: 'STARTING',
      running: 'RUNNING',
      waiting: 'WAITING',
      shutting_down: 'SHUTTING_DOWN',
      terminated: 'TERMINATED',
      ended: 'ENDED'
    }.freeze

    # Roles an instance can take within a job flow.
    InstanceRoles = {
      master: 'MASTER',
      core: 'CORE',
      task: 'TASK'
    }.freeze

    # What to do with the job flow when a step fails.
    StepActionOnFailure = {
      terminate_job_flow: 'TERMINATE_JOB_FLOW',
      cancel_and_wait: 'CANCEL_AND_WAIT',
      continue: 'CONTINUE'
    }.freeze
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
module Rubadoop
  module Emr
    # Entry points for building an EMR JobSpec, either from a Ruby block or
    # from a string of DSL source.
    module JobflowBuilder
      extend ActiveSupport::Autoload

      autoload :JobSpec
      autoload :Step
      autoload :BootstrapAction

      class << self
        # Creates a JobSpec from +__params__+ and configures it with the
        # optional block. A block declaring exactly one argument receives the
        # spec as that argument; any other block is instance_eval'd on the
        # spec. Returns the JobSpec.
        def new_job_spec(__params__ = {}, &block)
          JobSpec.new(__params__).tap do |spec|
            next unless block

            if block.arity == 1
              block.call(spec)
            else
              spec.instance_eval(&block)
            end
          end
        end

        # Creates a JobSpec by evaluating +__spec_code__+ (a String of Ruby
        # source) with the new spec as self. Returns the JobSpec.
        #
        # NOTE(review): the string is passed straight to instance_eval, so it
        # runs as arbitrary Ruby -- only feed it trusted input.
        # The local names here (__params__, __spec_code__, dsl) are presumably
        # chosen to avoid clashing with names used by the evaluated source, so
        # they are left untouched.
        def load_job_spec(__params__ = {}, __spec_code__)
          new_job_spec(__params__) do |dsl|
            dsl.instance_eval(__spec_code__)
          end
        end
      end
    end
  end
end
|
36
|
+
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module Rubadoop
  module Emr
    module JobflowBuilder
      # Builder for one script-based bootstrap action entry.
      class BootstrapAction < Rubadoop::BaseDsl
        attr_accessor :args, :name, :path

        # params - passed through to BaseDsl.
        # path   - value emitted as the script's :path.
        def initialize(params, path)
          super(params)
          @path = path
        end

        # Appends one or more arguments for the script. Returns the
        # accumulated argument array.
        def arg(*value)
          (@args ||= []).concat(value)
        end

        # Returns the Hash form of the action:
        #   { name: ..., script_bootstrap_action: { path: ..., args: [...] } }
        # :name falls back to "Bootstrap Action"; :args is left out when no
        # arguments were set.
        def to_h
          script = { :path => @path }
          script[:args] = @args unless @args.nil?

          {
            :name => @name || "Bootstrap Action",
            :script_bootstrap_action => script
          }
        end
      end
    end
  end
end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
module Rubadoop
  module Emr
    module JobflowBuilder
      # DSL object describing a job flow: its instances, bootstrap actions and
      # steps. Child builders share this spec's params object.
      class JobSpec < Rubadoop::BaseDsl
        attr_accessor :additional_info, :ami_version, :bootstrap_actions, :instances, :log_uri, :name, :steps, :supported_products
        attr_accessor :termination_protected, :visible_to_all_users

        STREAMING_JAR_LOCATION = '/home/hadoop/contrib/streaming/hadoop-streaming.jar'

        # Adds a bootstrap action that runs the script at +path+. The optional
        # block configures a BootstrapAction builder. Returns the list of
        # bootstrap action hashes collected so far.
        def with_bootstrap_action(path, &block)
          action = configure_builder(BootstrapAction.new(params, path), block)
          (@bootstrap_actions ||= []) << action.to_h
        end

        # Records the master and slave instance types. :instance_count is
        # stored as core_count + 1 (presumably the extra one is the master
        # node -- confirm).
        def with_instances(master_type, core_type, core_count)
          cluster = (@instances ||= {})
          cluster[:master_instance_type] = master_type
          cluster[:slave_instance_type] = core_type
          cluster[:instance_count] = core_count + 1
        end

        # Sets :keep_job_flow_alive_when_no_steps. Only literal true/false is
        # accepted; anything else raises.
        def keep_alive(value)
          raise 'keep_alive value must be true/false' unless [true, false].include?(value)

          cluster = (@instances ||= {})
          cluster[:keep_job_flow_alive_when_no_steps] = value
        end

        # Adds a step that runs +jar+ (optionally with +main_class+). The
        # optional block configures the Step::Java builder. Returns the list
        # of step hashes collected so far.
        def add_jar_step(name, jar, main_class = nil, &block)
          step = Step::Java.new(params, jar, main_class)
          step.name = name
          configure_builder(step, block)
          (@steps ||= []) << step.to_h
        end

        # Adds a streaming step built on STREAMING_JAR_LOCATION. The optional
        # block configures the Step::Streaming builder. Returns the list of
        # step hashes collected so far.
        def add_streaming_step(name, &block)
          step = Step::Streaming.new(params, STREAMING_JAR_LOCATION)
          step.name = name
          configure_builder(step, block)
          (@steps ||= []) << step.to_h
        end

        # Returns a Hash of the job-flow level attributes that have been set;
        # nil attributes are omitted.
        # NOTE(review): termination_protected and visible_to_all_users have
        # accessors but are not part of this list -- confirm that is intended.
        def to_create_command
          exported = [:additional_info, :ami_version, :log_uri, :name, :bootstrap_actions, :instances, :supported_products]

          exported.each_with_object({}) do |attr, command|
            current = instance_variable_get("@#{attr}")
            command[attr] = current unless current.nil?
          end
        end

        # Returns the collected step hashes (nil when no step was added).
        def to_steps_command
          @steps
        end

        private

        # Applies the optional DSL block to +builder+: a one-argument block
        # receives the builder, any other block is instance_eval'd on it.
        # Returns the builder.
        def configure_builder(builder, block)
          return builder unless block

          if block.arity == 1
            block.call(builder)
          else
            builder.instance_eval(&block)
          end
          builder
        end
      end
    end
  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
module Rubadoop
  module Emr
    module JobflowBuilder
      # Step builders used by JobSpec: the MapReduce call builders extended
      # with the extra fields a job-flow step carries.
      module Step
        # Shared step behaviour mixed into the concrete step classes below.
        # It is an implementation detail of Step. The bare `private`/`public`
        # keywords that used to surround it were dropped: they only set the
        # default visibility of methods defined directly in Step, so they
        # never affected this module or the classes.
        module StepBehavior
          attr_accessor :name, :action_on_failure

          # params     - forwarded to the underlying call builder.
          # jar        - jar emitted under :hadoop_jar_step.
          # main_class - optional main class; only stored when given.
          #
          # The step name is set afterwards through #name=. The earlier
          # `@name = name` here only re-assigned the reader's own value
          # (there is no `name` argument), so it has been removed.
          def initialize(params, jar, main_class = nil)
            super(params)
            @jar = jar
            @main_class = main_class if main_class
          end

          # Returns the step as a Hash:
          #   { name:, hadoop_jar_step: { jar:, main_class:, args: }, action_on_failure: }
          # :name, :main_class and :action_on_failure are omitted when nil.
          # :args comes from to_hadoop_cli(skip_jar: true), which the including
          # class is expected to provide.
          def to_h
            jar_step = { :jar => @jar }
            jar_step[:main_class] = @main_class unless @main_class.nil?
            jar_step[:args] = to_hadoop_cli(skip_jar: true)

            built = {}
            built[:name] = @name unless @name.nil?
            built[:hadoop_jar_step] = jar_step
            built[:action_on_failure] = @action_on_failure unless @action_on_failure.nil?
            built
          end
        end

        # Streaming step: CallStreaming plus the step fields.
        class Streaming < Rubadoop::MapReduce::CallStreaming
          include StepBehavior
        end

        # Custom-jar step: CallJava plus the step fields.
        class Java < Rubadoop::MapReduce::CallJava
          include StepBehavior
        end
      end
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'rubadoop/map_reduce/io'
|
2
|
+
require 'rubadoop/map_reduce/utils'
|
3
|
+
|
4
|
+
module Rubadoop
  # Namespace for the MapReduce helpers. The constants below are registered
  # through ActiveSupport::Autoload; Io and Utils are extended onto the module
  # itself, so their methods can be called as Rubadoop::MapReduce.<method>.
  module MapReduce
    extend ActiveSupport::Autoload

    # Builders for hadoop invocations.
    autoload :CallJava
    autoload :CallStreaming

    # Pieces used by map/reduce tasks.
    autoload :JobConfEnvironment
    autoload :Mappable
    autoload :Mapper
    autoload :Reducable
    autoload :Reducer
    autoload :Identity

    # Test support.
    autoload :TestAssist

    extend Io
    extend Utils
  end
end
|
23
|
+
|
@@ -0,0 +1,112 @@
|
|
1
|
+
module Rubadoop
  module MapReduce
    # Builder for a `hadoop jar` invocation: the jar, an optional main class,
    # -D configuration entries, -cmdenv variables, cache files/archives and
    # trailing program arguments.
    class CallJava < BaseDsl
      attr_accessor :jar, :main_class, :envs, :args, :confs, :files, :archives

      class << self
        # Creates a CallJava from +params+ and configures it with the optional
        # block. A block declaring exactly one argument receives the builder;
        # any other block is instance_eval'd on it. Returns the builder.
        def new_java_call(params = {}, &block)
          builder = CallJava.new(params)
          if block
            if block.arity == 1
              block.call(builder)
            else
              builder.instance_eval(&block)
            end
          end
          builder
        end
      end

      def initialize(params = {})
        super(params)
      end

      # Sets environment variable +name+ (emitted as `-cmdenv name=value`).
      def env(name, value)
        @envs ||= {}
        @envs[name.to_sym] = value
      end

      # Sets configuration entry +name+ (emitted as `-Dname=value`),
      # replacing any previous value.
      def conf(name, value)
        @confs ||= {}
        @confs[name.to_sym] = value
      end

      # Adds +value+ to configuration entry +name+ without discarding earlier
      # values: the first value is stored as-is, later ones turn the entry
      # into an array, and every element is emitted as its own -D option.
      #
      # Only a nil previous value counts as "not set", so an earlier `false`
      # is kept rather than overwritten. A new array is built instead of
      # appending in place, so an array handed to #conf by the caller is
      # never mutated.
      def conf_concat(name, value)
        @confs ||= {}
        key = name.to_sym
        existing = @confs[key]

        @confs[key] =
          if existing.nil?
            value
          elsif existing.kind_of?(Array)
            existing + [value]
          else
            [existing, value]
          end
      end

      # Appends one or more program arguments. Returns the argument array.
      def arg(*value)
        @args ||= []
        @args.concat value
      end

      # Registers a cache file as "location#symlink".
      def file(location, symlink)
        @files ||= []
        @files << "#{location}##{symlink}"
      end

      # Registers a cache archive as "location#symlink".
      def archive(location, symlink)
        @archives ||= []
        @archives << "#{location}##{symlink}"
      end

      # Builds the command line as an Array.
      #
      # opts - :skip_jar  omits the leading "hadoop jar <jar> [main_class]"
      #        :skip_args omits the trailing program arguments
      #
      # Order: jar prefix, -D entries, -cmdenv, -cacheFile, -cacheArchive,
      # then the program arguments. Raises (via #validate) when no jar is set.
      def to_hadoop_cli(opts = {})
        validate
        cmd = []
        unless opts[:skip_jar]
          cmd.concat ["hadoop", "jar", jar]
          cmd << @main_class unless @main_class.nil?
        end

        (@confs || {}).each do |key, value|
          if value.kind_of?(Array)
            value.each { |entry| cmd << "-D#{key}=#{entry}" }
          else
            cmd << "-D#{key}=#{value}"
          end
        end
        (@envs || {}).each { |key, value| cmd.concat ['-cmdenv', "#{key}=#{value}"] }
        (@files || []).each { |entry| cmd.concat ['-cacheFile', entry.to_s] }
        (@archives || []).each { |entry| cmd.concat ['-cacheArchive', entry.to_s] }

        cmd.concat @args if @args && !opts[:skip_args]
        cmd
      end

      # Returns the builder state as a Hash keyed by attribute name. Unset
      # attributes are included with a nil value. Raises (via #validate) when
      # no jar is set.
      def to_h
        validate
        [:jar, :main_class, :envs, :args, :confs, :files, :archives].each_with_object({}) do |attr, built|
          built[attr] = instance_variable_get("@#{attr}")
        end
      end

      # Raises "Missing @jar" when the jar has not been set.
      def validate
        [:@jar].each do |property|
          raise "Missing #{property}" if instance_variable_get(property).nil?
        end
      end
    end
  end
end
|