rubadoop 0.7.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +7 -0
  3. data/Gemfile.lock +43 -0
  4. data/README.rdoc +7 -0
  5. data/Rakefile +25 -0
  6. data/lib/rubadoop.rb +26 -0
  7. data/lib/rubadoop/base_dsl.rb +31 -0
  8. data/lib/rubadoop/emr.rb +23 -0
  9. data/lib/rubadoop/emr/jobflow_builder.rb +36 -0
  10. data/lib/rubadoop/emr/jobflow_builder/bootstrap_action.rb +27 -0
  11. data/lib/rubadoop/emr/jobflow_builder/job_spec.rb +77 -0
  12. data/lib/rubadoop/emr/jobflow_builder/step.rb +41 -0
  13. data/lib/rubadoop/map_reduce.rb +23 -0
  14. data/lib/rubadoop/map_reduce/call_java.rb +112 -0
  15. data/lib/rubadoop/map_reduce/call_streaming.rb +55 -0
  16. data/lib/rubadoop/map_reduce/identity.rb +30 -0
  17. data/lib/rubadoop/map_reduce/io.rb +128 -0
  18. data/lib/rubadoop/map_reduce/job_conf_environment.rb +9 -0
  19. data/lib/rubadoop/map_reduce/mappable.rb +59 -0
  20. data/lib/rubadoop/map_reduce/mapper.rb +15 -0
  21. data/lib/rubadoop/map_reduce/reducable.rb +74 -0
  22. data/lib/rubadoop/map_reduce/reducer.rb +12 -0
  23. data/lib/rubadoop/map_reduce/test_assist.rb +65 -0
  24. data/lib/rubadoop/map_reduce/utils.rb +29 -0
  25. data/lib/rubadoop/oozie/workflow_builder.rb +42 -0
  26. data/lib/rubadoop/oozie/workflow_builder/job_properties.rb +19 -0
  27. data/lib/rubadoop/version.rb +3 -0
  28. data/test/rubadoop/base_dsl_test.rb +27 -0
  29. data/test/rubadoop/emr/jobflow_builder_test.rb +184 -0
  30. data/test/rubadoop/map_reduce/call_java_test.rb +122 -0
  31. data/test/rubadoop/map_reduce/call_streaming_test.rb +81 -0
  32. data/test/rubadoop/map_reduce/identity_test.rb +40 -0
  33. data/test/rubadoop/map_reduce/io_test.rb +51 -0
  34. data/test/rubadoop/map_reduce/job_conf_environment_test.rb +28 -0
  35. data/test/rubadoop/map_reduce/mappable_test.rb +62 -0
  36. data/test/rubadoop/map_reduce/mapper_test.rb +76 -0
  37. data/test/rubadoop/map_reduce/reducable_test.rb +12 -0
  38. data/test/rubadoop/map_reduce/reducer_test.rb +137 -0
  39. data/test/rubadoop/map_reduce/test_assist_test.rb +76 -0
  40. data/test/rubadoop/oozie/workflow_builder_test.rb +21 -0
  41. data/test/test_helper.rb +10 -0
  42. metadata +140 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: e877fa9607f54419f1a7cd9d2033439465bc7fd1
4
+ data.tar.gz: 99cd20b45eb293eae96a2836ae0e207b83d38ff3
5
+ SHA512:
6
+ metadata.gz: 22c74443aabfe4b73edba04d89ccd4ce2ddd9eaaa8c3f7db389165a359f566a4d83f5608d051ed1af92700b5ce9b45177a18678155785f8677ee5d13320659ac
7
+ data.tar.gz: 570d9085f2dc997b908d09d9a12ec3d5041f85e6ae95cb260f3753de6ecab3bb23c2293bcfdd111ad7bbcd5551c9a548746891a00bd1093cf4279ca5307abd54
data/Gemfile ADDED
@@ -0,0 +1,7 @@
1
# Fetch gems over HTTPS so that packages and index data cannot be tampered
# with in transit (the plain http:// endpoint offers no such protection).
source 'https://rubygems.org'

# Pull the dependency list from the gem's own .gemspec file.
gemspec

group :test do
  # require: false stops Bundler from auto-requiring simplecov; whoever needs
  # coverage must require it explicitly.
  gem 'simplecov', require: false
end
@@ -0,0 +1,43 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ rubadoop (0.7.7)
5
+ activesupport (>= 3)
6
+ aws-sdk (>= 1.5.8)
7
+
8
+ GEM
9
+ remote: http://rubygems.org/
10
+ specs:
11
+ activesupport (4.0.0)
12
+ i18n (~> 0.6, >= 0.6.4)
13
+ minitest (~> 4.2)
14
+ multi_json (~> 1.3)
15
+ thread_safe (~> 0.1)
16
+ tzinfo (~> 0.3.37)
17
+ atomic (1.1.14)
18
+ aws-sdk (1.24.0)
19
+ json (~> 1.4)
20
+ nokogiri (>= 1.4.4, < 1.6.0)
21
+ uuidtools (~> 2.1)
22
+ i18n (0.6.5)
23
+ json (1.8.1)
24
+ minitest (4.7.5)
25
+ multi_json (1.8.2)
26
+ nokogiri (1.5.10)
27
+ rdoc (4.0.0)
28
+ simplecov (0.7.1)
29
+ multi_json (~> 1.0)
30
+ simplecov-html (~> 0.7.1)
31
+ simplecov-html (0.7.1)
32
+ thread_safe (0.1.3)
33
+ atomic
34
+ tzinfo (0.3.38)
35
+ uuidtools (2.1.4)
36
+
37
+ PLATFORMS
38
+ ruby
39
+
40
+ DEPENDENCIES
41
+ rdoc
42
+ rubadoop!
43
+ simplecov
@@ -0,0 +1,7 @@
1
+ Rubadoop provides:
2
+
3
+ * Streaming support structure
4
+ * Hadoop Definition DSL
5
+ * Amazon::EMR Definition DSL
6
+ * Oozie Definition DSL
7
+
@@ -0,0 +1,25 @@
1
+ require 'rake'
2
+ require 'rake/testtask'
3
+ require 'rdoc/task'
4
+
5
# Running plain `rake` executes the test suite.
task default: :test

task :build do
  # Use sh rather than exec: exec replaces the rake process, so any task
  # queued after :build would silently never run. sh runs the command as a
  # child process and makes rake fail if the command exits non-zero.
  sh 'gem build rubadoop.gemspec'
end

# Runs every test/**/*_test.rb file with lib/ and test/ on the load path.
Rake::TestTask.new(:test) do |t|
  t.libs << 'lib'
  t.libs << 'test'
  t.pattern = 'test/**/*_test.rb'
  t.verbose = true
end

desc 'Generate documentation.'
RDoc::Task.new(:rdoc) do |rdoc|
  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = 'Rubadoop'
  rdoc.options << '--line-numbers' << '--inline-source'
  rdoc.rdoc_files.include('README.rdoc')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
@@ -0,0 +1,26 @@
1
+ require 'active_support/dependencies/autoload'
2
+ require 'active_support/concern'
3
+ require 'active_support/core_ext/class/attribute'
4
+ require 'active_support/inflector'
5
+
6
# Root namespace of the gem. Constants are registered with
# ActiveSupport::Autoload, so the files that define them are only loaded the
# first time the constant is referenced.
module Rubadoop
  extend ActiveSupport::Autoload

  # Namespace for the Amazon EMR job flow DSL.
  # NOTE(review): this file never requires 'rubadoop/emr'; confirm that the
  # constants defined there are loaded by whoever needs them.
  module Emr
    extend ActiveSupport::Autoload

    autoload :JobflowBuilder
  end

  # Namespace for the Oozie workflow DSL.
  module Oozie
    extend ActiveSupport::Autoload

    autoload :WorkflowBuilder
  end

  autoload :BaseDsl
  autoload :MapReduce
end
25
+
26
+ require 'rubadoop/version'
@@ -0,0 +1,31 @@
1
+ require 'active_support/core_ext/hash/indifferent_access'
2
+
3
module Rubadoop
  # Common base for the DSL builders. It owns a "strict" parameter hash:
  # reading a key through #[] raises when the stored value is nil, so a
  # missing parameter is reported at the point of use.
  class BaseDsl
    # The strict parameter hash shared by this builder.
    attr_reader :params

    # params - nil, a Hash, or a hash that has already been made strict by
    #          another BaseDsl. An already-strict hash (recognised by its
    #          singleton #[] method) is reused as-is, so builders created
    #          from it share the same object; anything else is copied.
    def initialize(params)
      already_strict = params.singleton_methods.include?(:[])
      @params = already_strict ? params : strict_copy_of(params)
    end

    # Returns true when +param+ is present as a key, whatever its value.
    def has_param?(param)
      params.key?(param)
    end

    # Stores +default_value+ under +param+ unless the key already exists.
    # Returns the stored default, or nil when the key was already present.
    def optional_param(param, default_value)
      return if has_param?(param)

      params[param] = default_value
    end

    private

    # Builds an indifferent-access copy of +raw+ (an empty one for nil) whose
    # #[] raises "Missing param: <name>" for nil values.
    def strict_copy_of(raw)
      copy = (raw ? raw.dup : {}).with_indifferent_access

      def copy.[](param)
        value = super(param)
        raise "Missing param: #{param}" if value.nil?
        value
      end

      copy
    end
  end
end
31
+
@@ -0,0 +1,23 @@
1
module Rubadoop
  module Emr
    # Symbol => string lookup tables for values used in EMR requests.
    # The tables are frozen so that no caller can accidentally alter a
    # constant that is shared by the whole process.

    # Job flow states.
    JobFlowStates = {
      :starting => 'STARTING',
      :running => 'RUNNING',
      :waiting => 'WAITING',
      :shutting_down => 'SHUTTING_DOWN',
      :terminated => 'TERMINATED',
      :ended => 'ENDED'
    }.freeze

    # Instance roles.
    InstanceRoles = {
      :master => 'MASTER',
      :core => 'CORE',
      :task => 'TASK'
    }.freeze

    # Values for a step's action_on_failure setting.
    StepActionOnFailure = {
      :terminate_job_flow => 'TERMINATE_JOB_FLOW',
      :cancel_and_wait => 'CANCEL_AND_WAIT',
      :continue => 'CONTINUE'
    }.freeze

  end
end
@@ -0,0 +1,36 @@
1
module Rubadoop
  module Emr
    # Entry points for describing an EMR job flow with the JobSpec DSL.
    module JobflowBuilder
      extend ActiveSupport::Autoload

      autoload :JobSpec
      autoload :Step
      autoload :BootstrapAction

      # Creates a JobSpec from +__params__+ and configures it with the
      # optional block. A block taking exactly one argument receives the
      # spec; any other block is instance_eval'd on the spec. Returns the
      # spec.
      def self.new_job_spec(__params__ = {}, &block)
        job_spec = JobSpec.new(__params__)
        return job_spec unless block

        if block.arity == 1
          block.call(job_spec)
        else
          job_spec.instance_eval(&block)
        end

        job_spec
      end

      # Creates a JobSpec from +__params__+ and evaluates the Ruby source
      # string +__spec_code__+ against it. Returns the spec.
      #
      # The string is evaluated with instance_eval, so it runs with full
      # privileges: only pass code from a trusted source. The locals of this
      # method (+__params__+, +__spec_code__+ and +dsl+) are visible to the
      # evaluated code, which is why their names must stay exactly as is.
      def self.load_job_spec(__params__ = {}, __spec_code__)
        new_job_spec(__params__) do |dsl|
          dsl.instance_eval __spec_code__
        end
      end

    end
  end
end
36
+
@@ -0,0 +1,27 @@
1
module Rubadoop
  module Emr
    module JobflowBuilder
      # Builder for a single bootstrap action: a script path plus optional
      # name and arguments.
      class BootstrapAction < Rubadoop::BaseDsl
        attr_accessor :args, :name, :path

        # params - parameter hash handed to BaseDsl.
        # path   - location of the bootstrap script.
        def initialize(params, path)
          super(params)
          @path = path
        end

        # Appends one or more arguments and returns the accumulated list.
        def arg(*value)
          (@args ||= []).concat(value)
        end

        # Returns the action as a hash with :name and
        # :script_bootstrap_action (:path, plus :args when any were given).
        # The name falls back to "Bootstrap Action" when none was set.
        def to_h
          script = { :path => @path }
          script[:args] = @args unless @args.nil?

          {
            :name => @name || "Bootstrap Action",
            :script_bootstrap_action => script
          }
        end
      end
    end
  end
end
@@ -0,0 +1,77 @@
1
module Rubadoop
  module Emr
    module JobflowBuilder
      # DSL object collecting everything needed to describe a job flow:
      # instances, bootstrap actions and steps.
      class JobSpec < Rubadoop::BaseDsl
        attr_accessor :additional_info, :ami_version, :bootstrap_actions, :instances, :log_uri, :name, :steps, :supported_products
        attr_accessor :termination_protected, :visible_to_all_users

        STREAMING_JAR_LOCATION = '/home/hadoop/contrib/streaming/hadoop-streaming.jar'

        # Adds a bootstrap action running the script at +path+. The optional
        # block configures the BootstrapAction builder. Returns the list of
        # bootstrap action hashes collected so far.
        def with_bootstrap_action(path, &block)
          builder = BootstrapAction.new(params, path)
          apply_block_to(builder, &block)
          (@bootstrap_actions ||= []) << builder.to_h
        end

        # Records the master and slave instance types. The stored instance
        # count is +core_count+ plus one.
        def with_instances(master_type, core_type, core_count)
          cluster = (@instances ||= {})
          cluster[:master_instance_type] = master_type
          cluster[:slave_instance_type] = core_type
          cluster[:instance_count] = core_count + 1
        end

        # Stores the keep-alive flag. Raises unless +value+ is exactly true
        # or false.
        def keep_alive(value)
          boolean = true.equal?(value) || false.equal?(value)
          raise 'keep_alive value must be true/false' unless boolean

          (@instances ||= {})[:keep_job_flow_alive_when_no_steps] = value
        end

        # Appends a step running +jar+ (optionally with +main_class+). The
        # block configures the Step::Java builder. Returns the list of step
        # hashes collected so far.
        def add_jar_step(name, jar, main_class = nil, &block)
          builder = Step::Java.new(params, jar, main_class)
          builder.name = name
          apply_block_to(builder, &block)
          (@steps ||= []) << builder.to_h
        end

        # Appends a step that runs the jar at STREAMING_JAR_LOCATION. The
        # block configures the Step::Streaming builder. Returns the list of
        # step hashes collected so far.
        def add_streaming_step(name, &block)
          builder = Step::Streaming.new(params, STREAMING_JAR_LOCATION)
          builder.name = name
          apply_block_to(builder, &block)
          (@steps ||= []) << builder.to_h
        end

        # Returns a hash of the job-flow level settings that are set
        # (nil values are left out). Steps are not part of it.
        # NOTE(review): termination_protected and visible_to_all_users have
        # accessors but are not emitted here - confirm that is intended.
        def to_create_command
          emitted = [:additional_info, :ami_version, :log_uri, :name, :bootstrap_actions, :instances, :supported_products]
          emitted.each_with_object({}) do |attr, built|
            attr_value = instance_variable_get("@#{attr}")
            built[attr] = attr_value unless attr_value.nil?
          end
        end

        # Returns the collected step hashes, or nil when none were added.
        def to_steps_command
          @steps
        end

        private

        # Hands +builder+ to the block: a block taking exactly one argument
        # receives the builder, any other block is instance_eval'd on it.
        # Does nothing when no block was given.
        def apply_block_to(builder, &block)
          return unless block

          if block.arity == 1
            block.call(builder)
          else
            builder.instance_eval(&block)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,41 @@
1
module Rubadoop
  module Emr
    module JobflowBuilder
      # Step builders: the MapReduce call builders extended with the extra
      # attributes of a job flow step.
      module Step

        # Behaviour shared by every step builder: a name, an
        # action_on_failure setting and a hash representation.
        module StepBehavior
          attr_accessor :name, :action_on_failure

          # params     - parameter hash handed to the superclass.
          # jar        - jar the step runs.
          # main_class - optional main class inside the jar.
          def initialize(params, jar, main_class = nil)
            super(params)
            # `name` is the attribute reader, so this stores back whatever
            # @name already holds; callers set the real name afterwards.
            @name = name
            @jar = jar
            @main_class = main_class if main_class
          end

          # Returns the step as a hash with :name and :action_on_failure
          # (each only when set) and :hadoop_jar_step, which holds :jar,
          # :main_class (only when set) and :args. The args come from
          # to_hadoop_cli(skip_jar: true).
          def to_h
            built = {}
            built[:name] = @name unless @name.nil?

            jar_step = { :jar => @jar }
            jar_step[:main_class] = @main_class unless @main_class.nil?
            jar_step[:args] = to_hadoop_cli(skip_jar: true)
            built[:hadoop_jar_step] = jar_step

            built[:action_on_failure] = @action_on_failure unless @action_on_failure.nil?
            built
          end
        end

        # Step built on the streaming call builder.
        class Streaming < Rubadoop::MapReduce::CallStreaming
          include StepBehavior
        end

        # Step built on the plain jar call builder.
        class Java < Rubadoop::MapReduce::CallJava
          include StepBehavior
        end

      end
    end
  end
end
@@ -0,0 +1,23 @@
1
+ require 'rubadoop/map_reduce/io'
2
+ require 'rubadoop/map_reduce/utils'
3
+
4
module Rubadoop
  # Namespace for the MapReduce helpers. The classes below are autoloaded on
  # first use; the methods of Io and Utils are available directly on this
  # module.
  module MapReduce
    extend ActiveSupport::Autoload

    # Call builders.
    autoload :CallJava
    autoload :CallStreaming

    # Job-side helpers.
    autoload :JobConfEnvironment
    autoload :Mappable
    autoload :Mapper
    autoload :Reducable
    autoload :Reducer
    autoload :Identity
    autoload :TestAssist

    # Keep this order: Utils is extended last, so it takes precedence over Io.
    extend Io
    extend Utils
  end
end
23
+
@@ -0,0 +1,112 @@
1
module Rubadoop
  module MapReduce
    # Builder describing a `hadoop jar` invocation: the jar, an optional main
    # class, -D configuration entries, environment entries, cache files and
    # archives, and trailing arguments.
    class CallJava < BaseDsl
      attr_accessor :jar, :main_class, :envs, :args, :confs, :files, :archives

      class << self
        # Creates a CallJava from +params+ and configures it with the
        # optional block. A block taking exactly one argument receives the
        # builder; any other block is instance_eval'd on it. Returns the
        # builder.
        def new_java_call(params = {}, &block)
          builder = CallJava.new(params)
          if block_given?
            if block.arity == 1
              yield builder
            else
              builder.instance_eval(&block)
            end
          end
          builder
        end
      end

      # params - parameter hash handed to BaseDsl (defaults to empty).
      def initialize(params = {})
        super(params)
      end

      # Sets environment entry +name+ to +value+ (emitted as -cmdenv).
      def env(name, value)
        @envs ||= {}
        @envs[name.to_sym] = value
      end

      # Sets configuration entry +name+ to +value+ (emitted as -Dname=value),
      # replacing any previous value. An Array value emits one -D entry per
      # element.
      def conf(name, value)
        @confs ||= {}
        @confs[name.to_sym] = value
      end

      # Adds +value+ to configuration entry +name+ without discarding what is
      # already there: an unset (nil) entry becomes +value+, a scalar becomes
      # [previous, value] and an Array gets +value+ appended. Returns the new
      # stored value.
      #
      # A previous value of false counts as set and is kept. A new Array is
      # built instead of appending in place, so an Array the caller passed to
      # #conf is never mutated.
      def conf_concat(name, value)
        @confs ||= {}
        key = name.to_sym
        prev_value = @confs[key]

        @confs[key] =
          if prev_value.nil?
            value
          elsif prev_value.kind_of?(Array)
            prev_value + [value]
          else
            [prev_value, value]
          end
      end

      # Appends one or more trailing arguments; returns the accumulated list.
      def arg(*value)
        @args ||= []
        @args.concat value
      end

      # Registers a cache file as "location#symlink".
      def file(location, symlink)
        @files ||= []
        @files << "#{location}##{symlink}"
      end

      # Registers a cache archive as "location#symlink".
      def archive(location, symlink)
        @archives ||= []
        @archives << "#{location}##{symlink}"
      end

      # Returns the invocation as an Array of command-line tokens.
      #
      # opts - :skip_jar  leaves out the leading "hadoop jar <jar> [main]".
      #        :skip_args leaves out the trailing arguments.
      #
      # Raises when no jar has been set (see #validate).
      def to_hadoop_cli(opts = {})
        validate
        cmd = []
        unless opts[:skip_jar]
          cmd.concat ["hadoop", "jar", jar]
          cmd << @main_class unless @main_class.nil?
        end

        @confs.each { |key, value|
          if value.kind_of?(Array)
            value.each { |entry| cmd << "-D#{key}=#{entry}" }
          else
            cmd << "-D#{key}=#{value}"
          end
        } if @confs
        @envs.each { |key, value|
          cmd.concat ['-cmdenv', "#{key}=#{value}"]
        } if @envs
        @files.each { |entry|
          cmd.concat ['-cacheFile', "#{entry}"]
        } if @files
        @archives.each { |entry|
          cmd.concat ['-cacheArchive', "#{entry}"]
        } if @archives
        unless opts[:skip_args]
          cmd.concat @args if @args
        end
        cmd
      end

      # Returns every attribute in a hash keyed by attribute name; attributes
      # that were never set appear with a nil value. Raises when no jar has
      # been set.
      def to_h
        validate
        built = {}
        [:@jar, :@main_class, :@envs, :@args, :@confs, :@files, :@archives].each { |entry|
          built[entry.to_s.delete("@").to_sym] = instance_variable_get(entry)
        }
        built
      end

      # Raises "Missing @jar" when the jar has not been set.
      def validate
        [:@jar].each { |property|
          raise "Missing #{property}" if instance_variable_get(property).nil?
        }
      end

    end
  end
end