rubadoop 0.7.8

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +7 -0
  3. data/Gemfile.lock +43 -0
  4. data/README.rdoc +7 -0
  5. data/Rakefile +25 -0
  6. data/lib/rubadoop.rb +26 -0
  7. data/lib/rubadoop/base_dsl.rb +31 -0
  8. data/lib/rubadoop/emr.rb +23 -0
  9. data/lib/rubadoop/emr/jobflow_builder.rb +36 -0
  10. data/lib/rubadoop/emr/jobflow_builder/bootstrap_action.rb +27 -0
  11. data/lib/rubadoop/emr/jobflow_builder/job_spec.rb +77 -0
  12. data/lib/rubadoop/emr/jobflow_builder/step.rb +41 -0
  13. data/lib/rubadoop/map_reduce.rb +23 -0
  14. data/lib/rubadoop/map_reduce/call_java.rb +112 -0
  15. data/lib/rubadoop/map_reduce/call_streaming.rb +55 -0
  16. data/lib/rubadoop/map_reduce/identity.rb +30 -0
  17. data/lib/rubadoop/map_reduce/io.rb +128 -0
  18. data/lib/rubadoop/map_reduce/job_conf_environment.rb +9 -0
  19. data/lib/rubadoop/map_reduce/mappable.rb +59 -0
  20. data/lib/rubadoop/map_reduce/mapper.rb +15 -0
  21. data/lib/rubadoop/map_reduce/reducable.rb +74 -0
  22. data/lib/rubadoop/map_reduce/reducer.rb +12 -0
  23. data/lib/rubadoop/map_reduce/test_assist.rb +65 -0
  24. data/lib/rubadoop/map_reduce/utils.rb +29 -0
  25. data/lib/rubadoop/oozie/workflow_builder.rb +42 -0
  26. data/lib/rubadoop/oozie/workflow_builder/job_properties.rb +19 -0
  27. data/lib/rubadoop/version.rb +3 -0
  28. data/test/rubadoop/base_dsl_test.rb +27 -0
  29. data/test/rubadoop/emr/jobflow_builder_test.rb +184 -0
  30. data/test/rubadoop/map_reduce/call_java_test.rb +122 -0
  31. data/test/rubadoop/map_reduce/call_streaming_test.rb +81 -0
  32. data/test/rubadoop/map_reduce/identity_test.rb +40 -0
  33. data/test/rubadoop/map_reduce/io_test.rb +51 -0
  34. data/test/rubadoop/map_reduce/job_conf_environment_test.rb +28 -0
  35. data/test/rubadoop/map_reduce/mappable_test.rb +62 -0
  36. data/test/rubadoop/map_reduce/mapper_test.rb +76 -0
  37. data/test/rubadoop/map_reduce/reducable_test.rb +12 -0
  38. data/test/rubadoop/map_reduce/reducer_test.rb +137 -0
  39. data/test/rubadoop/map_reduce/test_assist_test.rb +76 -0
  40. data/test/rubadoop/oozie/workflow_builder_test.rb +21 -0
  41. data/test/test_helper.rb +10 -0
  42. metadata +140 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: e877fa9607f54419f1a7cd9d2033439465bc7fd1
4
+ data.tar.gz: 99cd20b45eb293eae96a2836ae0e207b83d38ff3
5
+ SHA512:
6
+ metadata.gz: 22c74443aabfe4b73edba04d89ccd4ce2ddd9eaaa8c3f7db389165a359f566a4d83f5608d051ed1af92700b5ce9b45177a18678155785f8677ee5d13320659ac
7
+ data.tar.gz: 570d9085f2dc997b908d09d9a12ec3d5041f85e6ae95cb260f3753de6ecab3bb23c2293bcfdd111ad7bbcd5551c9a548746891a00bd1093cf4279ca5307abd54
data/Gemfile ADDED
@@ -0,0 +1,7 @@
1
# Fetch gems over TLS; the plain-http endpoint allows tampering in transit.
source "https://rubygems.org"

# Take the dependency list from the gemspec in this directory.
gemspec

group :test do
  # require: false stops Bundler.require from loading simplecov automatically.
  gem 'simplecov', require: false
end
@@ -0,0 +1,43 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ rubadoop (0.7.7)
5
+ activesupport (>= 3)
6
+ aws-sdk (>= 1.5.8)
7
+
8
+ GEM
9
+ remote: http://rubygems.org/
10
+ specs:
11
+ activesupport (4.0.0)
12
+ i18n (~> 0.6, >= 0.6.4)
13
+ minitest (~> 4.2)
14
+ multi_json (~> 1.3)
15
+ thread_safe (~> 0.1)
16
+ tzinfo (~> 0.3.37)
17
+ atomic (1.1.14)
18
+ aws-sdk (1.24.0)
19
+ json (~> 1.4)
20
+ nokogiri (>= 1.4.4, < 1.6.0)
21
+ uuidtools (~> 2.1)
22
+ i18n (0.6.5)
23
+ json (1.8.1)
24
+ minitest (4.7.5)
25
+ multi_json (1.8.2)
26
+ nokogiri (1.5.10)
27
+ rdoc (4.0.0)
28
+ simplecov (0.7.1)
29
+ multi_json (~> 1.0)
30
+ simplecov-html (~> 0.7.1)
31
+ simplecov-html (0.7.1)
32
+ thread_safe (0.1.3)
33
+ atomic
34
+ tzinfo (0.3.38)
35
+ uuidtools (2.1.4)
36
+
37
+ PLATFORMS
38
+ ruby
39
+
40
+ DEPENDENCIES
41
+ rdoc
42
+ rubadoop!
43
+ simplecov
@@ -0,0 +1,7 @@
1
+ Rubadoop provides:
2
+
3
+ * Streaming support structure
4
+ * Hadoop Definition DSL
5
+ * Amazon::EMR Definition DSL
6
+ * Oozie Definition DSL
7
+
@@ -0,0 +1,25 @@
1
+ require 'rake'
2
+ require 'rake/testtask'
3
+ require 'rdoc/task'
4
+
5
# Running plain `rake` executes the test suite.
task default: :test

desc 'Build the gem from rubadoop.gemspec.'
task :build do
  # sh (unlike exec) does not replace the rake process, so later tasks can
  # still run, and it fails the task when the command exits non-zero.
  sh 'gem build rubadoop.gemspec'
end

# Runs every test/**/*_test.rb file with lib/ and test/ on the load path.
Rake::TestTask.new(:test) do |t|
  t.libs << 'lib'
  t.libs << 'test'
  t.pattern = 'test/**/*_test.rb'
  t.verbose = true
end

desc 'Generate documentation.'
RDoc::Task.new(:rdoc) do |rdoc|
  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = 'Rubadoop'
  rdoc.options << '--line-numbers' << '--inline-source'
  rdoc.rdoc_files.include('README.rdoc')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
@@ -0,0 +1,26 @@
1
+ require 'active_support/dependencies/autoload'
2
+ require 'active_support/concern'
3
+ require 'active_support/core_ext/class/attribute'
4
+ require 'active_support/inflector'
5
+
6
# Root namespace of the gem. Constants are registered for lazy loading
# through ActiveSupport::Autoload rather than being required eagerly.
module Rubadoop
  extend ActiveSupport::Autoload

  autoload :BaseDsl
  autoload :MapReduce

  # Namespace for the Amazon EMR job flow DSL.
  module Emr
    extend ActiveSupport::Autoload

    autoload :JobflowBuilder
  end

  # Namespace for the Oozie workflow DSL.
  module Oozie
    extend ActiveSupport::Autoload

    autoload :WorkflowBuilder
  end
end
25
+
26
+ require 'rubadoop/version'
@@ -0,0 +1,31 @@
1
+ require 'active_support/core_ext/hash/indifferent_access'
2
+
3
module Rubadoop
  # Base class for the DSL builders. It holds a "strict" params hash whose
  # #[] raises when the requested value is missing (nil).
  class BaseDsl
    attr_reader :params

    # params - a Hash of DSL parameters, or nil for none.
    #
    # A hash that already carries the strict singleton #[] (for example one
    # handed down from another builder) is stored as is, not copied. Any
    # other hash is duplicated, given indifferent access and made strict.
    def initialize(params)
      if params.singleton_methods.member?(:[])
        @params = params
        return
      end

      strict = (params ? params.dup : {}).with_indifferent_access

      # Singleton override: lookups of absent/nil values fail loudly.
      def strict.[](param)
        value = super(param)
        raise "Missing param: #{param}" if value.nil?
        value
      end

      @params = strict
    end

    # Returns true when the key is present in params (no strictness check).
    def has_param?(param)
      params.has_key?(param)
    end

    # Stores default_value under param only when the key is not present yet.
    def optional_param(param, default_value)
      return if has_param?(param)

      params[param] = default_value
    end
  end
end
31
+
@@ -0,0 +1,23 @@
1
module Rubadoop
  module Emr
    # Symbol => string lookup tables for values used with EMR.
    # Each table is frozen so the shared constants cannot be modified by
    # accident at runtime (the string values themselves are left unfrozen).

    # Job flow lifecycle states.
    JobFlowStates = {
      :starting => 'STARTING',
      :running => 'RUNNING',
      :waiting => 'WAITING',
      :shutting_down => 'SHUTTING_DOWN',
      :terminated => 'TERMINATED',
      :ended => 'ENDED'
    }.freeze

    # Roles an instance can have inside a job flow.
    InstanceRoles = {
      :master => 'MASTER',
      :core => 'CORE',
      :task => 'TASK'
    }.freeze

    # What to do with the job flow when a step fails.
    StepActionOnFailure = {
      :terminate_job_flow => 'TERMINATE_JOB_FLOW',
      :cancel_and_wait => 'CANCEL_AND_WAIT',
      :continue => 'CONTINUE'
    }.freeze
  end
end
@@ -0,0 +1,36 @@
1
module Rubadoop
  module Emr
    # Entry points for building EMR job specifications with the JobSpec DSL.
    module JobflowBuilder
      extend ActiveSupport::Autoload

      autoload :JobSpec
      autoload :Step
      autoload :BootstrapAction

      class << self
        # Creates a JobSpec from __params__ and configures it with the block.
        # A one-argument block receives the spec; any other block is
        # instance_eval'd against it. Returns the spec.
        def new_job_spec(__params__ = {}, &block)
          job_spec = JobSpec.new(__params__)
          return job_spec unless block_given?

          if block.arity == 1
            yield job_spec
          else
            job_spec.instance_eval(&block)
          end
          job_spec
        end

        # Builds a JobSpec by evaluating the Ruby source string
        # __spec_code__ in the context of a fresh spec. Returns the spec.
        # The underscore-wrapped names presumably exist to avoid clashing
        # with identifiers used inside the evaluated code.
        def load_job_spec(__params__ = {}, __spec_code__)
          new_job_spec(__params__) { |dsl| dsl.instance_eval(__spec_code__) }
        end
      end
    end
  end
end
36
+
@@ -0,0 +1,27 @@
1
module Rubadoop
  module Emr
    module JobflowBuilder
      # DSL builder describing one script bootstrap action.
      class BootstrapAction < Rubadoop::BaseDsl
        attr_accessor :args, :name, :path

        # params - DSL params, forwarded to BaseDsl.
        # path   - location of the bootstrap script.
        def initialize(params, path)
          super(params)
          @path = path
        end

        # Appends one or more arguments for the script; returns the list.
        def arg(*value)
          (@args ||= []).concat(value)
        end

        # Hash form of the action. :name falls back to "Bootstrap Action";
        # :args is included only when at least one arg call was made.
        def to_h
          script = { :path => @path }
          script[:args] = @args unless @args.nil?

          {
            :name => @name || "Bootstrap Action",
            :script_bootstrap_action => script
          }
        end
      end
    end
  end
end
@@ -0,0 +1,77 @@
1
module Rubadoop
  module Emr
    module JobflowBuilder
      # DSL object that collects the settings of an EMR job flow: bootstrap
      # actions, instance configuration and steps.
      class JobSpec < Rubadoop::BaseDsl
        attr_accessor :additional_info, :ami_version, :bootstrap_actions, :instances, :log_uri, :name, :steps, :supported_products
        attr_accessor :termination_protected, :visible_to_all_users

        STREAMING_JAR_LOCATION = '/home/hadoop/contrib/streaming/hadoop-streaming.jar'

        # Adds a bootstrap action running the script at path. The optional
        # block configures the BootstrapAction builder.
        # Returns the accumulated list of bootstrap action hashes.
        def with_bootstrap_action(path, &block)
          builder = configure_builder(BootstrapAction.new(params, path), &block)
          (@bootstrap_actions ||= []) << builder.to_h
        end

        # Sets the instance types. The instance count is core_count plus
        # one, the extra instance being the master.
        def with_instances(master_type, core_type, core_count)
          @instances ||= {}
          @instances[:master_instance_type] = master_type
          @instances[:slave_instance_type] = core_type
          @instances[:instance_count] = core_count + 1
        end

        # Sets whether the job flow stays alive when it has no steps left.
        # Raises unless value is exactly true or false.
        def keep_alive(value)
          raise 'keep_alive value must be true/false' unless value == true || value == false
          @instances ||= {}
          @instances[:keep_job_flow_alive_when_no_steps] = value
        end

        # Adds a step running jar (optionally with main_class). The optional
        # block configures the Step::Java builder.
        # Returns the accumulated list of step hashes.
        def add_jar_step(name, jar, main_class = nil, &block)
          builder = Step::Java.new(params, jar, main_class)
          builder.name = name
          configure_builder(builder, &block)
          (@steps ||= []) << builder.to_h
        end

        # Adds a streaming step that uses STREAMING_JAR_LOCATION. The
        # optional block configures the Step::Streaming builder.
        # Returns the accumulated list of step hashes.
        def add_streaming_step(name, &block)
          builder = Step::Streaming.new(params, STREAMING_JAR_LOCATION)
          builder.name = name
          configure_builder(builder, &block)
          (@steps ||= []) << builder.to_h
        end

        # Hash of the job-flow level settings that have been set (nil values
        # are left out). Steps are reported separately by #to_steps_command.
        # NOTE(review): termination_protected and visible_to_all_users are
        # not emitted here - confirm whether that is intentional.
        def to_create_command
          [:additional_info, :ami_version, :log_uri, :name, :bootstrap_actions, :instances, :supported_products].each_with_object({}) do |attr, built|
            attr_value = instance_variable_get("@#{attr}")
            built[attr] = attr_value unless attr_value.nil?
          end
        end

        # The list of step hashes added so far, or nil when there are none.
        def to_steps_command
          @steps
        end

        private

        # Applies an optional DSL block to builder and returns builder.
        # A one-argument block receives the builder; any other block is
        # instance_eval'd against it.
        def configure_builder(builder, &block)
          return builder unless block

          if block.arity == 1
            block.call(builder)
          else
            builder.instance_eval(&block)
          end
          builder
        end
      end
    end
  end
end
@@ -0,0 +1,41 @@
1
module Rubadoop
  module Emr
    module JobflowBuilder
      # EMR step builders layered on the MapReduce call builders.
      module Step
        # Behaviour shared by the step classes: remembers the jar (and an
        # optional main class) and renders the hash form of an EMR step.
        #
        # The bare `private`/`public` markers that used to surround these
        # definitions were dropped: they only change the visibility of
        # methods, not of constants, so they had no effect.
        module StepBehavior
          attr_accessor :name, :action_on_failure

          # params     - DSL params, forwarded to the superclass initializer.
          # jar        - location of the jar the step runs.
          # main_class - optional main class inside the jar.
          #
          # The name is assigned by the caller through #name=. The previous
          # `@name = name` line only read the still-unset attribute and
          # wrote nil back, so it has been removed.
          def initialize(params, jar, main_class = nil)
            super(params)
            @jar = jar
            @main_class = main_class if main_class
          end

          # Hash form of the step. :name, :main_class and :action_on_failure
          # are included only when set; :args is the hadoop command line
          # without the leading "hadoop jar <jar> [main class]" part.
          def to_h
            jar_step = { :jar => @jar }
            jar_step[:main_class] = @main_class unless @main_class.nil?
            jar_step[:args] = to_hadoop_cli(skip_jar: true)

            built = {}
            built[:name] = @name unless @name.nil?
            built[:hadoop_jar_step] = jar_step
            built[:action_on_failure] = @action_on_failure unless @action_on_failure.nil?
            built
          end
        end

        # Streaming step.
        class Streaming < Rubadoop::MapReduce::CallStreaming
          include StepBehavior
        end

        # Custom jar step.
        class Java < Rubadoop::MapReduce::CallJava
          include StepBehavior
        end
      end
    end
  end
end
@@ -0,0 +1,23 @@
1
+ require 'rubadoop/map_reduce/io'
2
+ require 'rubadoop/map_reduce/utils'
3
+
4
module Rubadoop
  # Namespace for the map/reduce helpers. Constants are registered for lazy
  # loading; Io and Utils are mixed in as module-level methods.
  module MapReduce
    extend ActiveSupport::Autoload

    # Registered in the same order as before.
    [
      :CallJava,
      :CallStreaming,
      :JobConfEnvironment,
      :Mappable,
      :Mapper,
      :Reducable,
      :Reducer,
      :Identity,
      :TestAssist
    ].each { |const_name| autoload const_name }

    extend Io
    extend Utils
  end
end
23
+
@@ -0,0 +1,112 @@
1
module Rubadoop
  module MapReduce
    # DSL builder for a `hadoop jar` invocation: collects the jar, main
    # class, -D configuration values, environment entries, cache
    # files/archives and trailing arguments, and renders them as an argv
    # array (#to_hadoop_cli) or as a Hash (#to_h).
    class CallJava < BaseDsl
      attr_accessor :jar, :main_class, :envs, :args, :confs, :files, :archives

      class << self
        # Creates a CallJava and configures it with the optional block.
        # A one-argument block receives the builder; any other block is
        # instance_eval'd against it. Returns the builder.
        def new_java_call(params = {}, &block)
          builder = CallJava.new(params)
          if block
            if block.arity == 1
              block.call(builder)
            else
              builder.instance_eval(&block)
            end
          end
          builder
        end
      end

      # params - DSL params, forwarded to BaseDsl (defaults to none).
      def initialize(params = {})
        super(params)
      end

      # Sets an environment entry, rendered as `-cmdenv name=value`.
      def env(name, value)
        @envs ||= {}
        @envs[name.to_sym] = value
      end

      # Sets a configuration value, rendered as `-Dname=value`. Replaces any
      # value stored earlier under the same name.
      def conf(name, value)
        @confs ||= {}
        @confs[name.to_sym] = value
      end

      # Adds a configuration value without dropping earlier ones: repeated
      # calls for one name collect the values in an Array, each rendered as
      # its own -D entry.
      #
      # Only nil counts as "no previous value". The earlier truthiness check
      # silently discarded a stored `false`.
      def conf_concat(name, value)
        @confs ||= {}
        key = name.to_sym
        previous = @confs[key]

        if previous.nil?
          @confs[key] = value
        elsif previous.kind_of?(Array)
          previous << value
        else
          @confs[key] = [previous, value]
        end
      end

      # Appends one or more trailing command arguments; returns the list.
      def arg(*value)
        @args ||= []
        @args.concat value
      end

      # Adds a cache file entry of the form "location#symlink".
      def file(location, symlink)
        @files ||= []
        @files << "#{location}##{symlink}"
      end

      # Adds a cache archive entry of the form "location#symlink".
      def archive(location, symlink)
        @archives ||= []
        @archives << "#{location}##{symlink}"
      end

      # Renders the call as an array of command-line words.
      #
      # opts[:skip_jar]  - leave out the leading "hadoop jar <jar> [main class]".
      # opts[:skip_args] - leave out the trailing arguments.
      #
      # Order: -D entries, -cmdenv, -cacheFile, -cacheArchive, then args.
      # Raises (through #validate) when no jar has been set.
      def to_hadoop_cli(opts = {})
        validate
        cmd = []
        unless opts[:skip_jar]
          cmd.concat ["hadoop", "jar", jar]
          cmd << @main_class unless @main_class.nil?
        end

        (@confs || {}).each do |key, value|
          # A value built up by conf_concat is an Array: one -D per element.
          entries = value.kind_of?(Array) ? value : [value]
          entries.each { |entry| cmd << "-D#{key}=#{entry}" }
        end
        (@envs || {}).each { |key, value| cmd.concat ['-cmdenv', "#{key}=#{value}"] }
        (@files || []).each { |entry| cmd.concat ['-cacheFile', entry.to_s] }
        (@archives || []).each { |entry| cmd.concat ['-cacheArchive', entry.to_s] }

        cmd.concat @args if @args && !opts[:skip_args]
        cmd
      end

      # Hash of every collected attribute (unset ones map to nil).
      # Raises (through #validate) when no jar has been set.
      def to_h
        validate
        [:jar, :main_class, :envs, :args, :confs, :files, :archives].each_with_object({}) do |attr, built|
          built[attr] = instance_variable_get("@#{attr}")
        end
      end

      # Raises a RuntimeError ("Missing @jar") unless the jar has been set.
      def validate
        [:@jar].each do |property|
          raise "Missing #{property}" if instance_variable_get(property).nil?
        end
      end
    end
  end
end