rubadoop 0.7.8

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +7 -0
  3. data/Gemfile.lock +43 -0
  4. data/README.rdoc +7 -0
  5. data/Rakefile +25 -0
  6. data/lib/rubadoop.rb +26 -0
  7. data/lib/rubadoop/base_dsl.rb +31 -0
  8. data/lib/rubadoop/emr.rb +23 -0
  9. data/lib/rubadoop/emr/jobflow_builder.rb +36 -0
  10. data/lib/rubadoop/emr/jobflow_builder/bootstrap_action.rb +27 -0
  11. data/lib/rubadoop/emr/jobflow_builder/job_spec.rb +77 -0
  12. data/lib/rubadoop/emr/jobflow_builder/step.rb +41 -0
  13. data/lib/rubadoop/map_reduce.rb +23 -0
  14. data/lib/rubadoop/map_reduce/call_java.rb +112 -0
  15. data/lib/rubadoop/map_reduce/call_streaming.rb +55 -0
  16. data/lib/rubadoop/map_reduce/identity.rb +30 -0
  17. data/lib/rubadoop/map_reduce/io.rb +128 -0
  18. data/lib/rubadoop/map_reduce/job_conf_environment.rb +9 -0
  19. data/lib/rubadoop/map_reduce/mappable.rb +59 -0
  20. data/lib/rubadoop/map_reduce/mapper.rb +15 -0
  21. data/lib/rubadoop/map_reduce/reducable.rb +74 -0
  22. data/lib/rubadoop/map_reduce/reducer.rb +12 -0
  23. data/lib/rubadoop/map_reduce/test_assist.rb +65 -0
  24. data/lib/rubadoop/map_reduce/utils.rb +29 -0
  25. data/lib/rubadoop/oozie/workflow_builder.rb +42 -0
  26. data/lib/rubadoop/oozie/workflow_builder/job_properties.rb +19 -0
  27. data/lib/rubadoop/version.rb +3 -0
  28. data/test/rubadoop/base_dsl_test.rb +27 -0
  29. data/test/rubadoop/emr/jobflow_builder_test.rb +184 -0
  30. data/test/rubadoop/map_reduce/call_java_test.rb +122 -0
  31. data/test/rubadoop/map_reduce/call_streaming_test.rb +81 -0
  32. data/test/rubadoop/map_reduce/identity_test.rb +40 -0
  33. data/test/rubadoop/map_reduce/io_test.rb +51 -0
  34. data/test/rubadoop/map_reduce/job_conf_environment_test.rb +28 -0
  35. data/test/rubadoop/map_reduce/mappable_test.rb +62 -0
  36. data/test/rubadoop/map_reduce/mapper_test.rb +76 -0
  37. data/test/rubadoop/map_reduce/reducable_test.rb +12 -0
  38. data/test/rubadoop/map_reduce/reducer_test.rb +137 -0
  39. data/test/rubadoop/map_reduce/test_assist_test.rb +76 -0
  40. data/test/rubadoop/oozie/workflow_builder_test.rb +21 -0
  41. data/test/test_helper.rb +10 -0
  42. metadata +140 -0
@@ -0,0 +1,29 @@
1
module Rubadoop
  module MapReduce
    # Mixin helpers shared by mappers/reducers: tab-separated key/value
    # parsing plus thin delegators onto the MapReduce output sink.
    module Utils
      # Split a raw input line into [key, value] on the FIRST tab only;
      # any further tabs remain part of the value.
      def key_value_split(line)
        line.split("\t", 2)
      end

      # Increment a Hadoop counter in the given counter group.
      def log_counter(group, counter, amount)
        mr_out.counter(group, counter, amount)
      end

      # Report a status message back to the framework.
      def log_status(status)
        mr_out.status(status)
      end

      # Report an error message back to the framework.
      def log_error(message)
        mr_out.error(message)
      end

      # Emit a bare output value.
      def out_entry(value)
        mr_out.entry(value)
      end

      # Emit a key/value output pair.
      def out_map_entry(key, value)
        mr_out.map_entry(key, value)
      end

      private

      # Single access point for the MapReduce output sink.
      def mr_out
        ::Rubadoop::MapReduce.out
      end
    end
  end
end
@@ -0,0 +1,42 @@
1
module Rubadoop
  module Oozie
    # DSL entry points for building Oozie job.properties specs.
    module WorkflowBuilder
      extend ActiveSupport::Autoload

      autoload :JobProperties

      class << self
        # Build a JobProperties spec.
        #
        # The block may take the spec as an explicit argument (arity 1) or
        # be instance_eval'd against it for the bare-DSL style. Unless the
        # caller opts out, AWS S3 credentials from the aws-sdk global config
        # are injected as fs.s3/fs.s3n properties.
        #
        # NOTE: opting out is keyed on the *presence* of :skip_aws_keys --
        # passing `skip_aws_keys: false` still skips injection. Kept as-is
        # for backward compatibility; pass the key only when you mean it.
        def new_job_properties(params = {}, &block)
          job_spec = JobProperties.new(params)

          if block_given?
            if block.arity == 1
              yield job_spec
            else
              # Parenthesized to avoid the ambiguous `&block` argument
              # warning under `ruby -w`.
              job_spec.instance_eval(&block)
            end
          end

          unless params.key? :skip_aws_keys
            # Lazy require: only needed when credentials are injected.
            require 'aws-sdk'
            aws_config = AWS.config.credentials
            job_spec.prop 'fs.s3n.awsAccessKeyId', aws_config[:access_key_id]
            job_spec.prop 'fs.s3.awsAccessKeyId', aws_config[:access_key_id]
            job_spec.prop 'fs.s3n.awsSecretAccessKey', aws_config[:secret_access_key]
            job_spec.prop 'fs.s3.awsSecretAccessKey', aws_config[:secret_access_key]
          end

          job_spec
        end

        # Evaluate a string of spec DSL code (e.g. loaded from a file)
        # against a fresh JobProperties built with __params__.
        # Underscored names keep the eval'd code from shadowing them.
        def load_job_properties(__params__ = {}, __spec_code__)
          new_job_properties(__params__) do |dsl|
            dsl.instance_eval __spec_code__
          end
        end
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
module Rubadoop
  module Oozie
    module WorkflowBuilder
      # Accumulates Oozie job properties as name/value pairs via the DSL.
      class JobProperties < Rubadoop::BaseDsl
        # Record a single property, overwriting any previous value
        # stored under the same name.
        def prop(name, value)
          props[name] = value
        end

        # All accumulated properties as a Hash (empty when none were set).
        def to_h
          props
        end

        private

        # Lazily-initialized backing store for the properties.
        def props
          @props ||= {}
        end
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module Rubadoop
  # Gem version string. Frozen so the shared constant cannot be
  # mutated in place by callers.
  VERSION = "0.7.8".freeze
end
@@ -0,0 +1,27 @@
1
require 'test_helper'

module Rubadoop
  # Exercises BaseDsl parameter handling: strict lookup of unknown keys
  # and string/symbol indifferent access.
  class BaseDslTest < MiniTest::Spec

    # Reading an undeclared param from inside the DSL raises RuntimeError.
    def test_params_missing
      # Capture the test instance: inside instance_eval, `self` is the DSL,
      # so assertions must go through the captured reference.
      outer = self
      BaseDsl.new({ a: true, 'dude' => 'rubadoop' }).instance_eval do
        outer.assert params[:a]
        outer.assert_raises RuntimeError do
          params[:nope]
        end
      end
    end

    # Params are reachable by both symbol and string keys.
    def test_params_with_indifferent_access
      outer = self
      BaseDsl.new({ a: true, 'dude' => 'rubadoop' }).instance_eval do
        outer.assert params[:a]
        outer.assert params['a']
        outer.assert_equal 'rubadoop', params[:dude]
        outer.assert_equal 'rubadoop', params['dude']
      end
    end
  end
end
27
+
@@ -0,0 +1,184 @@
1
require 'test_helper'

module Rubadoop
  module Emr
    # Tests the EMR jobflow DSL: building create-jobflow and add-steps
    # command hashes from block-based specs (both the explicit-argument
    # and instance_eval styles).
    class JobflowBuilderTest < MiniTest::Spec

      # Empty spec -> empty command; ivars set in the block become fields.
      def test_base
        command = JobflowBuilder.new_job_spec do; end.to_create_command
        assert_equal command, Hash.new

        command = JobflowBuilder.new_job_spec do
          my_name = "testink"
          @name = my_name
        end.to_create_command
        assert_equal command, Hash[name: 'testink']
      end

      # params passed to new_job_spec are readable inside the block.
      def test_param
        command = JobflowBuilder.new_job_spec(name: 'testink') do
          @name = params[:name]
        end.to_create_command
        assert_equal command, Hash[name: 'testink']
      end

      # Unknown param keys raise RuntimeError; with an explicit block arg a
      # bare `params` call is a NameError (self is the test, not the spec).
      def test_missing_param
        assert_raises RuntimeError do
          JobflowBuilder.new_job_spec(name: 'testink') do
            @name = params[:namey]
          end.to_create_command
        end

        assert_raises RuntimeError do
          JobflowBuilder.new_job_spec(name: 'testink') do |s|
            s.name = s.params[:namey]
          end.to_create_command
        end

        assert_raises NameError do
          JobflowBuilder.new_job_spec(name: 'testink') do |s|
            s.name = params[:namey]
          end.to_create_command
        end
      end

      # NOTE(review): instance_count param is 1 but the expected command has
      # instance_count 2 -- presumably with_instances adds the master node on
      # top of the slave count; confirm against JobSpec#with_instances.
      def test_with_instances
        command = JobflowBuilder.new_job_spec(name: 'something') do
          optional_param :instance_type, 'c1.medium'
          optional_param :instance_count, 1

          with_instances 'c1.medium', params[:instance_type], params[:instance_count]
        end.to_create_command
        assert_equal command, {:instances=>{:master_instance_type=>"c1.medium", :slave_instance_type=>"c1.medium", :instance_count=>2}}
      end

      # Bootstrap actions: block-arg form, no-block form (default name
      # "Bootstrap Action"), and instance_eval form using the arg() helper.
      def test_bootstrap

        command = JobflowBuilder.new_job_spec do
          with_bootstrap_action "bootstrap location" do |b|
            b.name = 'bootstrap name'
            b.args = ['arg1', 'arg2']
          end
        end.to_create_command
        assert_equal command, Hash[bootstrap_actions:[{name:"bootstrap name",
                                                       script_bootstrap_action:{path:"bootstrap location",
                                                                                args:["arg1", "arg2"]}}]]

        command = JobflowBuilder.new_job_spec do
          with_bootstrap_action "bootstrap location"
        end.to_create_command
        assert_equal command, Hash[bootstrap_actions:[{name:"Bootstrap Action",
                                                       script_bootstrap_action:{path:"bootstrap location"}}]]

        command = JobflowBuilder.new_job_spec do
          with_bootstrap_action "bootstrap location" do
            @name = 'bootstrap name'
            arg "arg1", "arg2"
          end
        end.to_create_command
        assert_equal command, Hash[bootstrap_actions:[{name:"bootstrap name",
                                                       script_bootstrap_action:{path:"bootstrap location",
                                                                                args:["arg1", "arg2"]}}]]
      end

      # Job-level params are visible inside bootstrap-action blocks in
      # both the block-arg and instance_eval styles.
      def test_bootstrap_param
        command = JobflowBuilder.new_job_spec(bs_name: "total bs") do
          with_bootstrap_action "bootstrap location" do |b|
            b.name = b.params[:bs_name]
          end
        end.to_create_command
        assert_equal command, Hash[bootstrap_actions:[{name:"total bs",
                                                       script_bootstrap_action:{path:"bootstrap location"}}]]

        command = JobflowBuilder.new_job_spec(bs_name: "total bs") do
          with_bootstrap_action "bootstrap location" do
            @name = params[:bs_name]
          end
        end.to_create_command
        assert_equal command, Hash[bootstrap_actions:[{name:"total bs",
                                                       script_bootstrap_action:{path:"bootstrap location"}}]]
      end

      # Custom-jar steps render name, jar, main class, args and
      # action_on_failure into the add-steps command.
      def test_jar_steps
        command = JobflowBuilder.new_job_spec do
          add_jar_step 'jar name', 'jar jar', 'Binks' do |s|
            s.args = ["Meesa", "Stupid"]
            s.action_on_failure = 'CANCEL_AND_WAIT'
          end
        end.to_steps_command
        assert_equal [{name:"jar name", hadoop_jar_step:{jar:"jar jar", main_class:"Binks",
                                                         args:["Meesa", "Stupid"]},
                       action_on_failure:"CANCEL_AND_WAIT"}], command
      end

      # Streaming steps expand mapper/reducer/input/output into the
      # standard -input/-output/-mapper/-reducer args ahead of extra args.
      def test_streaming_steps
        command = JobflowBuilder.new_job_spec do
          add_streaming_step "streaming name" do |s|
            s.mapper = 'mapit'
            s.reducer = 'reduceit'
            s.input = 'fromhere'
            s.output = 'tothere'
            s.args = [ 'arg1', 'arg2',]
            s.action_on_failure = 'CANCEL_AND_WAIT'
          end
        end.to_steps_command
        assert_equal [{name:"streaming name", hadoop_jar_step:{jar: JobflowBuilder::JobSpec::STREAMING_JAR_LOCATION,
                                                               args:["-input", "fromhere", "-output","tothere", "-mapper", "mapit", "-reducer", "reduceit", "arg1", "arg2"]},
                      action_on_failure:"CANCEL_AND_WAIT"}], command
      end

      # Same as test_streaming_steps but via the instance_eval/ivar style.
      def test_streaming_steps2
        command = JobflowBuilder.new_job_spec do
          add_streaming_step "streaming name" do
            @mapper = 'mapit'
            @reducer = 'reduceit'
            @input = 'fromhere'
            @output = 'tothere'
            @args = [ 'arg1', 'arg2',]
            @action_on_failure = 'CANCEL_AND_WAIT'
          end
        end.to_steps_command
        assert_equal [{name:"streaming name", hadoop_jar_step:{jar: JobflowBuilder::JobSpec::STREAMING_JAR_LOCATION,
                                                               args:["-input", "fromhere", "-output","tothere", "-mapper", "mapit", "-reducer", "reduceit", "arg1", "arg2"]},
                      action_on_failure:"CANCEL_AND_WAIT"}], command
      end


      # Job-level params are visible inside step blocks in both styles.
      def test_steps_params
        command = JobflowBuilder.new_job_spec(arg1: 'aack') do
          add_jar_step 'jar name', 'jar jar', 'Binks' do |s|
            s.arg s.params[:arg1]
            s.action_on_failure = 'CANCEL_AND_WAIT'
          end
        end.to_steps_command
        assert_equal [{name:"jar name", hadoop_jar_step:{jar:"jar jar", main_class:"Binks",
                                                         args:["aack"]},
                       action_on_failure:"CANCEL_AND_WAIT"}], command

        command = JobflowBuilder.new_job_spec(arg1: 'aack') do
          add_jar_step 'jar name', 'jar jar', 'Binks' do
            arg params[:arg1]
            @action_on_failure = 'CANCEL_AND_WAIT'
          end
        end.to_steps_command
        assert_equal [{name:"jar name", hadoop_jar_step:{jar:"jar jar", main_class:"Binks",
                                                         args:["aack"]},
                       action_on_failure:"CANCEL_AND_WAIT"}], command
      end

      # keep_alive requires a real boolean; a truthy string must raise
      # rather than silently enabling keep-alive.
      def test_keepalive
        command = JobflowBuilder.new_job_spec do |job|
          job.keep_alive true
        end.to_create_command
        assert_equal command, Hash[instances: {keep_job_flow_alive_when_no_steps: true}]

        assert_raises RuntimeError do
          JobflowBuilder.new_job_spec do
            keep_alive "false" #if this doesn't check boolean, result would be true
          end
        end
      end

    end
  end
end
@@ -0,0 +1,122 @@
1
require 'test_helper'

module Rubadoop
  module MapReduce

    # Tests the CallJava builder: rendering `hadoop jar ...` command lines
    # and the equivalent hash form from block-configured specs.
    class CallJavaTest < MiniTest::Spec

      # Minimum viable call: just a jar.
      def test_simple
        call = CallJava.new_java_call() do |s|
          s.jar = 'hadoop-pie.jar'
        end
        assert_equal "hadoop jar hadoop-pie.jar", call.to_hadoop_cli.join(' ')
      end

      # env() renders as -cmdenv KEY=VALUE.
      def test_env
        call = CallJava.new_java_call() do |s|
          s.jar = 'hadoop-pie.jar'
          s.env 'abcd', 'efgh'
        end
        assert_equal "hadoop jar hadoop-pie.jar -cmdenv abcd=efgh", call.to_hadoop_cli.join(' ')
      end

      # conf() renders as -DKEY=VALUE.
      def test_conf
        call = CallJava.new_java_call() do |s|
          s.jar = 'hadoop-pie.jar'
          s.conf 'abcd', 'efgh'
        end
        assert_equal "hadoop jar hadoop-pie.jar -Dabcd=efgh", call.to_hadoop_cli.join(' ')
      end

      # conf_concat appends extra -D entries for the same key, with or
      # without a prior conf() for that key.
      def test_conf_multi
        call = CallJava.new_java_call() do |s|
          s.jar = 'hadoop-pie.jar'
          s.conf 'abcd', 'efgh'
          s.conf_concat 'abcd', 'ijkl'
          s.conf_concat 'abcd', 'mnop'
        end
        assert_equal "hadoop jar hadoop-pie.jar -Dabcd=efgh -Dabcd=ijkl -Dabcd=mnop", call.to_hadoop_cli.join(' ')

        call = CallJava.new_java_call() do |s|
          s.jar = 'hadoop-pie.jar'
          s.conf_concat 'abcd', 'mnop'
        end
        assert_equal "hadoop jar hadoop-pie.jar -Dabcd=mnop", call.to_hadoop_cli.join(' ')
      end

      # arg() passes bare trailing arguments through.
      def test_arg
        call = CallJava.new_java_call() do
          @jar = 'hadoop-pie.jar'
          arg 'abcd', 'efgh'
        end
        assert_equal "hadoop jar hadoop-pie.jar abcd efgh", call.to_hadoop_cli.join(' ')
      end

      # archive() renders as -cacheArchive path#symlink.
      def test_archive
        call = CallJava.new_java_call() do
          @jar = 'hadoop-pie.jar'
          archive 'dungeon', 'd'
        end
        assert_equal "hadoop jar hadoop-pie.jar -cacheArchive dungeon#d", call.to_hadoop_cli.join(' ')
      end

      # file() renders as -cacheFile path#symlink.
      def test_files
        call = CallJava.new_java_call() do
          @jar = 'hadoop-pie.jar'
          file 'phile', 'f'
        end
        assert_equal "hadoop jar hadoop-pie.jar -cacheFile phile#f", call.to_hadoop_cli.join(' ')
      end

      # CLI option groups are emitted in a fixed order regardless of the
      # order they were declared in: main class, confs, envs, files,
      # archives, then bare args.
      def test_order
        call = CallJava.new_java_call() do
          @jar = 'hadoop-pie.jar'
          @main_class = 'com.java.package.DoItLive'
          env 'e1', 'dudio'
          env 'e2', 'dudi-rio'
          file 'phile', 'f'
          conf 'c1', 'conf'
          conf 'c2', 'cronf'
          arg 'seriously'
          archive 'dungeon', 'd'
          file 'phile2', 'f2'
        end
        assert_equal "hadoop jar hadoop-pie.jar com.java.package.DoItLive -Dc1=conf -Dc2=cronf -cmdenv e1=dudio -cmdenv e2=dudi-rio -cacheFile phile#f -cacheFile phile2#f2 -cacheArchive dungeon#d seriously", call.to_hadoop_cli.join(' ')
      end

      # A call without a jar is invalid for both output forms.
      def test_validation
        assert_raises RuntimeError do
          CallJava.new_java_call() do
            @main_class = 'com.java.package.DoItLive'
          end.to_hadoop_cli
        end

        assert_raises RuntimeError do
          CallJava.new_java_call() do
            @main_class = 'com.java.package.DoItLive'
          end.to_h
        end
      end

      # to_h exposes only the known fields; unknown ivars (@poopsy) are
      # dropped, and duplicate file() entries are preserved as given.
      def test_hash
        call = CallJava.new_java_call() do
          @jar = 'hadoop-pie.jar'
          @main_class = 'com.java.package.DoItLive'
          env 'e1', 'dudio'
          env 'e2', 'dudi-rio'
          file 'phile', 'f'
          conf 'c1', 'conf'
          conf 'c2', 'cronf'
          arg 'seriously'
          archive 'dungeon', 'd'
          file 'phile', 'f'
          @poopsy = 'poo'
        end
        assert_equal Hash[jar: "hadoop-pie.jar", archives: ["dungeon#d"],
                          main_class: "com.java.package.DoItLive", envs: {e1: "dudio", e2: "dudi-rio"},
                          files: ["phile#f", "phile#f"], confs: {c1: "conf", c2: "cronf"}, args: ["seriously"]], call.to_h
      end

    end
  end
end
@@ -0,0 +1,81 @@
1
require 'test_helper'

module Rubadoop
  module MapReduce

    # Tests the CallStreaming builder: rendering hadoop-streaming command
    # lines / hashes and the required-field validation.
    class CallStreamingTest < MiniTest::Spec

      # Minimal streaming call: jar, input, output, mapper, reducer.
      def test_simple
        streaming = CallStreaming.new_streaming_call do |s|
          s.jar = 'hadoop-streaming.jar'
          s.input = 'input'
          s.output = 'output'
          s.mapper = '/bin/cat'
          s.reducer = '/bin/wc'
        end
        assert_equal "hadoop jar hadoop-streaming.jar -input input -output output -mapper /bin/cat -reducer /bin/wc", streaming.to_hadoop_cli.join(' ')
      end

      # to_h includes all fields (unset ones as nil) and drops unknown
      # ivars. NOTE(review): @test inside a block with an explicit |s| arg
      # is set on the test instance, not the spec -- hence "noshow".
      def test_hash
        streaming = CallStreaming.new_streaming_call do |s|
          s.jar = 'hadoop-streaming.jar'
          s.inputformat = 'green'
          s.input = 'input'
          s.outputformat = 'blue'
          s.output = 'output'
          s.mapper = '/bin/cat'
          s.reducer = '/bin/wc'
          @test = 'noshow'
        end
        assert_equal Hash[jar: "hadoop-streaming.jar", main_class: nil, envs: nil, args: nil, confs: nil, files: nil, archives: nil,
                          mapper: "/bin/cat", reducer: "/bin/wc", inputformat: "green", input: "input", outputformat: "blue", output: "output"], streaming.to_h
      end

      # Same spec as test_hash, checked against the tokenized CLI form.
      def test_cli
        streaming = CallStreaming.new_streaming_call do |s|
          s.jar = 'hadoop-streaming.jar'
          s.inputformat = 'green'
          s.input = 'input'
          s.outputformat = 'blue'
          s.output = 'output'
          s.mapper = '/bin/cat'
          s.reducer = '/bin/wc'
          @test = 'noshow'
        end
        assert_equal ["hadoop", "jar", "hadoop-streaming.jar", "-inputformat", "green", "-input", "input", "-outputformat", "blue",
                      "-output", "output", "-mapper", "/bin/cat", "-reducer", "/bin/wc"], streaming.to_hadoop_cli
      end

      # Missing mapper (first case) or missing output (second case)
      # invalidates the call for both output forms.
      def test_validation
        assert_raises RuntimeError do
          CallStreaming.new_streaming_call do |s|
            s.jar = 'hadoop-streaming.jar'
            s.input = 'input'
            s.output = 'output'
            s.reducer = '/bin/wc'
          end.to_hadoop_cli
        end

        assert_raises RuntimeError do
          CallStreaming.new_streaming_call() do
            @jar = 'hadoop-streaming.jar'
            @input = 'input'
            @mapper = '/bin/cat'
            @reducer = '/bin/wc'
          end.to_h
        end
      end

      # The jar requirement inherited from CallJava still applies to
      # streaming calls.
      def test_validation_inherited
        assert_raises RuntimeError do
          CallStreaming.new_streaming_call() do |s|
            s.input = 'input'
            s.output = 'output'
            s.mapper = '/bin/cat'
            s.reducer = '/bin/wc'
          end.to_hadoop_cli
        end
      end
    end
  end
end