rubadoop 0.7.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. checksums.yaml +7 -0
  2. data/Gemfile +7 -0
  3. data/Gemfile.lock +43 -0
  4. data/README.rdoc +7 -0
  5. data/Rakefile +25 -0
  6. data/lib/rubadoop.rb +26 -0
  7. data/lib/rubadoop/base_dsl.rb +31 -0
  8. data/lib/rubadoop/emr.rb +23 -0
  9. data/lib/rubadoop/emr/jobflow_builder.rb +36 -0
  10. data/lib/rubadoop/emr/jobflow_builder/bootstrap_action.rb +27 -0
  11. data/lib/rubadoop/emr/jobflow_builder/job_spec.rb +77 -0
  12. data/lib/rubadoop/emr/jobflow_builder/step.rb +41 -0
  13. data/lib/rubadoop/map_reduce.rb +23 -0
  14. data/lib/rubadoop/map_reduce/call_java.rb +112 -0
  15. data/lib/rubadoop/map_reduce/call_streaming.rb +55 -0
  16. data/lib/rubadoop/map_reduce/identity.rb +30 -0
  17. data/lib/rubadoop/map_reduce/io.rb +128 -0
  18. data/lib/rubadoop/map_reduce/job_conf_environment.rb +9 -0
  19. data/lib/rubadoop/map_reduce/mappable.rb +59 -0
  20. data/lib/rubadoop/map_reduce/mapper.rb +15 -0
  21. data/lib/rubadoop/map_reduce/reducable.rb +74 -0
  22. data/lib/rubadoop/map_reduce/reducer.rb +12 -0
  23. data/lib/rubadoop/map_reduce/test_assist.rb +65 -0
  24. data/lib/rubadoop/map_reduce/utils.rb +29 -0
  25. data/lib/rubadoop/oozie/workflow_builder.rb +42 -0
  26. data/lib/rubadoop/oozie/workflow_builder/job_properties.rb +19 -0
  27. data/lib/rubadoop/version.rb +3 -0
  28. data/test/rubadoop/base_dsl_test.rb +27 -0
  29. data/test/rubadoop/emr/jobflow_builder_test.rb +184 -0
  30. data/test/rubadoop/map_reduce/call_java_test.rb +122 -0
  31. data/test/rubadoop/map_reduce/call_streaming_test.rb +81 -0
  32. data/test/rubadoop/map_reduce/identity_test.rb +40 -0
  33. data/test/rubadoop/map_reduce/io_test.rb +51 -0
  34. data/test/rubadoop/map_reduce/job_conf_environment_test.rb +28 -0
  35. data/test/rubadoop/map_reduce/mappable_test.rb +62 -0
  36. data/test/rubadoop/map_reduce/mapper_test.rb +76 -0
  37. data/test/rubadoop/map_reduce/reducable_test.rb +12 -0
  38. data/test/rubadoop/map_reduce/reducer_test.rb +137 -0
  39. data/test/rubadoop/map_reduce/test_assist_test.rb +76 -0
  40. data/test/rubadoop/oozie/workflow_builder_test.rb +21 -0
  41. data/test/test_helper.rb +10 -0
  42. metadata +140 -0
# data/lib/rubadoop/map_reduce/utils.rb
module Rubadoop
  module MapReduce
    # Mixin of small helpers for map/reduce scripts.  Everything except
    # key_value_split delegates to the shared ::Rubadoop::MapReduce.out sink.
    module Utils
      # Splits a raw input line on the first tab into [key, remainder].
      def key_value_split(line)
        line.split("\t", 2)
      end

      # Bumps the Hadoop counter +counter+ in +group+ by +amount+.
      def log_counter(group, counter, amount)
        ::Rubadoop::MapReduce.out.counter(group, counter, amount)
      end

      # Reports a task status message.
      def log_status(status)
        ::Rubadoop::MapReduce.out.status(status)
      end

      # Reports an error message.
      def log_error(message)
        ::Rubadoop::MapReduce.out.error(message)
      end

      # Emits a plain output record.
      def out_entry(value)
        ::Rubadoop::MapReduce.out.entry(value)
      end

      # Emits a key/value output record.
      def out_map_entry(key, value)
        ::Rubadoop::MapReduce.out.map_entry(key, value)
      end
    end
  end
end
# data/lib/rubadoop/oozie/workflow_builder.rb
module Rubadoop
  module Oozie
    # Factory for Oozie job.properties definitions, exposed as a small DSL.
    module WorkflowBuilder
      extend ActiveSupport::Autoload

      autoload :JobProperties

      class << self
        # Builds a JobProperties object from +params+.
        #
        # The optional block is yielded the new object when it takes an
        # argument (arity 1); otherwise it is instance_eval'd against it.
        # Unless +params+ contains :skip_aws_keys, the credentials from the
        # aws-sdk global config are copied into the s3/s3n Hadoop
        # filesystem properties.
        def new_job_properties(params = {}, &block)
          properties = JobProperties.new(params)

          if block
            if block.arity == 1
              yield properties
            else
              properties.instance_eval(&block)
            end
          end

          unless params.has_key? :skip_aws_keys
            require 'aws-sdk' # deferred so the gem is optional when keys are skipped
            credentials = AWS.config.credentials
            properties.prop 'fs.s3n.awsAccessKeyId', credentials[:access_key_id]
            properties.prop 'fs.s3.awsAccessKeyId', credentials[:access_key_id]
            properties.prop 'fs.s3n.awsSecretAccessKey', credentials[:secret_access_key]
            properties.prop 'fs.s3.awsSecretAccessKey', credentials[:secret_access_key]
          end

          properties
        end

        # Evaluates +__spec_code__+ (a String of DSL source) against a fresh
        # JobProperties built from +__params__+.
        # NOTE(review): the underscored parameter names look deliberate,
        # presumably to avoid colliding with names used by the eval'd code —
        # confirm before renaming.
        def load_job_properties(__params__ = {}, __spec_code__)
          new_job_properties(__params__) do |dsl|
            dsl.instance_eval(__spec_code__)
          end
        end
      end
    end
  end
end
# data/lib/rubadoop/oozie/workflow_builder/job_properties.rb
module Rubadoop
  module Oozie
    module WorkflowBuilder
      # DSL object holding the name/value pairs of an Oozie job.properties
      # file.  Properties are recorded with #prop and read back via #to_h.
      class JobProperties < Rubadoop::BaseDsl
        # Records a single property; a later call with the same name wins.
        def prop(name, value)
          props[name] = value
        end

        # All properties recorded so far, as a Hash (insertion order kept).
        def to_h
          props
        end

        private

        # Lazily-created backing store for the recorded properties.
        def props
          @props ||= {}
        end
      end
    end
  end
end
# data/lib/rubadoop/version.rb
module Rubadoop
  # Gem version.  Frozen so the shared constant cannot be mutated in place.
  VERSION = "0.7.8".freeze
end
# data/test/rubadoop/base_dsl_test.rb
require 'test_helper'

module Rubadoop
  # Covers BaseDsl param access: unknown keys raise, and known keys are
  # readable with either String or Symbol form (indifferent access).
  class BaseDslTest < MiniTest::Spec

    def test_params_missing
      spec = self # keep the test object reachable inside instance_eval
      BaseDsl.new({ a: true, 'dude' => 'rubadoop' }).instance_eval do
        spec.assert params[:a]
        spec.assert_raises RuntimeError do
          params[:nope]
        end
      end
    end

    def test_params_with_indifferent_access
      spec = self
      BaseDsl.new({ a: true, 'dude' => 'rubadoop' }).instance_eval do
        spec.assert params[:a]
        spec.assert params['a']
        spec.assert_equal 'rubadoop', params[:dude]
        spec.assert_equal 'rubadoop', params['dude']
      end
    end
  end
end
# data/test/rubadoop/emr/jobflow_builder_test.rb
require 'test_helper'

module Rubadoop
  module Emr
    # Covers the EMR jobflow DSL: create-command generation, required and
    # optional params, bootstrap actions, jar/streaming steps and keep-alive.
    class JobflowBuilderTest < MiniTest::Spec

      def test_base
        # An empty spec yields an empty create command.
        actual = JobflowBuilder.new_job_spec { }.to_create_command
        assert_equal actual, {}

        actual = JobflowBuilder.new_job_spec do
          local_name = "testink"
          @name = local_name
        end.to_create_command
        assert_equal actual, { name: 'testink' }
      end

      def test_param
        # params given at construction are visible inside the DSL block.
        actual = JobflowBuilder.new_job_spec(name: 'testink') do
          @name = params[:name]
        end.to_create_command
        assert_equal actual, { name: 'testink' }
      end

      def test_missing_param
        # Unknown params raise from the instance_eval'd form...
        assert_raises RuntimeError do
          JobflowBuilder.new_job_spec(name: 'testink') do
            @name = params[:namey]
          end.to_create_command
        end

        # ...and from the explicit-receiver form.
        assert_raises RuntimeError do
          JobflowBuilder.new_job_spec(name: 'testink') do |spec|
            spec.name = spec.params[:namey]
          end.to_create_command
        end

        # With a block parameter there is no implicit receiver, so a bare
        # `params` call is a NameError rather than a missing-param error.
        assert_raises NameError do
          JobflowBuilder.new_job_spec(name: 'testink') do |spec|
            spec.name = params[:namey]
          end.to_create_command
        end
      end

      def test_with_instances
        actual = JobflowBuilder.new_job_spec(name: 'something') do
          optional_param :instance_type, 'c1.medium'
          optional_param :instance_count, 1

          with_instances 'c1.medium', params[:instance_type], params[:instance_count]
        end.to_create_command
        expected = { instances: { master_instance_type: "c1.medium",
                                  slave_instance_type: "c1.medium",
                                  instance_count: 2 } }
        assert_equal actual, expected
      end

      def test_bootstrap
        # Attribute-writer style.
        actual = JobflowBuilder.new_job_spec do
          with_bootstrap_action "bootstrap location" do |action|
            action.name = 'bootstrap name'
            action.args = ['arg1', 'arg2']
          end
        end.to_create_command
        assert_equal actual, { bootstrap_actions: [{ name: "bootstrap name",
                                                     script_bootstrap_action: { path: "bootstrap location",
                                                                                args: ["arg1", "arg2"] } }] }

        # No block: a default action name is filled in.
        actual = JobflowBuilder.new_job_spec do
          with_bootstrap_action "bootstrap location"
        end.to_create_command
        assert_equal actual, { bootstrap_actions: [{ name: "Bootstrap Action",
                                                     script_bootstrap_action: { path: "bootstrap location" } }] }

        # instance_eval style.
        actual = JobflowBuilder.new_job_spec do
          with_bootstrap_action "bootstrap location" do
            @name = 'bootstrap name'
            arg "arg1", "arg2"
          end
        end.to_create_command
        assert_equal actual, { bootstrap_actions: [{ name: "bootstrap name",
                                                     script_bootstrap_action: { path: "bootstrap location",
                                                                                args: ["arg1", "arg2"] } }] }
      end

      def test_bootstrap_param
        actual = JobflowBuilder.new_job_spec(bs_name: "total bs") do
          with_bootstrap_action "bootstrap location" do |action|
            action.name = action.params[:bs_name]
          end
        end.to_create_command
        assert_equal actual, { bootstrap_actions: [{ name: "total bs",
                                                     script_bootstrap_action: { path: "bootstrap location" } }] }

        actual = JobflowBuilder.new_job_spec(bs_name: "total bs") do
          with_bootstrap_action "bootstrap location" do
            @name = params[:bs_name]
          end
        end.to_create_command
        assert_equal actual, { bootstrap_actions: [{ name: "total bs",
                                                     script_bootstrap_action: { path: "bootstrap location" } }] }
      end

      def test_jar_steps
        actual = JobflowBuilder.new_job_spec do
          add_jar_step 'jar name', 'jar jar', 'Binks' do |step|
            step.args = ["Meesa", "Stupid"]
            step.action_on_failure = 'CANCEL_AND_WAIT'
          end
        end.to_steps_command
        expected = [{ name: "jar name",
                      hadoop_jar_step: { jar: "jar jar", main_class: "Binks",
                                         args: ["Meesa", "Stupid"] },
                      action_on_failure: "CANCEL_AND_WAIT" }]
        assert_equal expected, actual
      end

      def test_streaming_steps
        actual = JobflowBuilder.new_job_spec do
          add_streaming_step "streaming name" do |step|
            step.mapper = 'mapit'
            step.reducer = 'reduceit'
            step.input = 'fromhere'
            step.output = 'tothere'
            step.args = ['arg1', 'arg2']
            step.action_on_failure = 'CANCEL_AND_WAIT'
          end
        end.to_steps_command
        expected = [{ name: "streaming name",
                      hadoop_jar_step: { jar: JobflowBuilder::JobSpec::STREAMING_JAR_LOCATION,
                                         args: ["-input", "fromhere", "-output", "tothere", "-mapper", "mapit", "-reducer", "reduceit", "arg1", "arg2"] },
                      action_on_failure: "CANCEL_AND_WAIT" }]
        assert_equal expected, actual
      end

      def test_streaming_steps2
        # Same expectations as test_streaming_steps, via the instance_eval form.
        actual = JobflowBuilder.new_job_spec do
          add_streaming_step "streaming name" do
            @mapper = 'mapit'
            @reducer = 'reduceit'
            @input = 'fromhere'
            @output = 'tothere'
            @args = ['arg1', 'arg2']
            @action_on_failure = 'CANCEL_AND_WAIT'
          end
        end.to_steps_command
        expected = [{ name: "streaming name",
                      hadoop_jar_step: { jar: JobflowBuilder::JobSpec::STREAMING_JAR_LOCATION,
                                         args: ["-input", "fromhere", "-output", "tothere", "-mapper", "mapit", "-reducer", "reduceit", "arg1", "arg2"] },
                      action_on_failure: "CANCEL_AND_WAIT" }]
        assert_equal expected, actual
      end

      def test_steps_params
        expected = [{ name: "jar name",
                      hadoop_jar_step: { jar: "jar jar", main_class: "Binks",
                                         args: ["aack"] },
                      action_on_failure: "CANCEL_AND_WAIT" }]

        actual = JobflowBuilder.new_job_spec(arg1: 'aack') do
          add_jar_step 'jar name', 'jar jar', 'Binks' do |step|
            step.arg step.params[:arg1]
            step.action_on_failure = 'CANCEL_AND_WAIT'
          end
        end.to_steps_command
        assert_equal expected, actual

        actual = JobflowBuilder.new_job_spec(arg1: 'aack') do
          add_jar_step 'jar name', 'jar jar', 'Binks' do
            arg params[:arg1]
            @action_on_failure = 'CANCEL_AND_WAIT'
          end
        end.to_steps_command
        assert_equal expected, actual
      end

      def test_keepalive
        actual = JobflowBuilder.new_job_spec do |job|
          job.keep_alive true
        end.to_create_command
        assert_equal actual, { instances: { keep_job_flow_alive_when_no_steps: true } }

        # keep_alive must reject non-booleans; the truthy String "false"
        # would otherwise silently enable keep-alive.
        assert_raises RuntimeError do
          JobflowBuilder.new_job_spec do
            keep_alive "false"
          end
        end
      end

    end
  end
end
# data/test/rubadoop/map_reduce/call_java_test.rb
require 'test_helper'

module Rubadoop
  module MapReduce
    # Covers CallJava: CLI rendering, option accumulation and ordering,
    # jar validation, and hash serialization.
    class CallJavaTest < MiniTest::Spec

      def test_simple
        cmd = CallJava.new_java_call do |c|
          c.jar = 'hadoop-pie.jar'
        end
        assert_equal "hadoop jar hadoop-pie.jar", cmd.to_hadoop_cli.join(' ')
      end

      def test_env
        cmd = CallJava.new_java_call do |c|
          c.jar = 'hadoop-pie.jar'
          c.env 'abcd', 'efgh'
        end
        assert_equal "hadoop jar hadoop-pie.jar -cmdenv abcd=efgh", cmd.to_hadoop_cli.join(' ')
      end

      def test_conf
        cmd = CallJava.new_java_call do |c|
          c.jar = 'hadoop-pie.jar'
          c.conf 'abcd', 'efgh'
        end
        assert_equal "hadoop jar hadoop-pie.jar -Dabcd=efgh", cmd.to_hadoop_cli.join(' ')
      end

      def test_conf_multi
        cmd = CallJava.new_java_call do |c|
          c.jar = 'hadoop-pie.jar'
          c.conf 'abcd', 'efgh'
          c.conf_concat 'abcd', 'ijkl'
          c.conf_concat 'abcd', 'mnop'
        end
        assert_equal "hadoop jar hadoop-pie.jar -Dabcd=efgh -Dabcd=ijkl -Dabcd=mnop", cmd.to_hadoop_cli.join(' ')

        # conf_concat with no prior conf for the key must also work.
        cmd = CallJava.new_java_call do |c|
          c.jar = 'hadoop-pie.jar'
          c.conf_concat 'abcd', 'mnop'
        end
        assert_equal "hadoop jar hadoop-pie.jar -Dabcd=mnop", cmd.to_hadoop_cli.join(' ')
      end

      def test_arg
        cmd = CallJava.new_java_call do
          @jar = 'hadoop-pie.jar'
          arg 'abcd', 'efgh'
        end
        assert_equal "hadoop jar hadoop-pie.jar abcd efgh", cmd.to_hadoop_cli.join(' ')
      end

      def test_archive
        cmd = CallJava.new_java_call do
          @jar = 'hadoop-pie.jar'
          archive 'dungeon', 'd'
        end
        assert_equal "hadoop jar hadoop-pie.jar -cacheArchive dungeon#d", cmd.to_hadoop_cli.join(' ')
      end

      def test_files
        cmd = CallJava.new_java_call do
          @jar = 'hadoop-pie.jar'
          file 'phile', 'f'
        end
        assert_equal "hadoop jar hadoop-pie.jar -cacheFile phile#f", cmd.to_hadoop_cli.join(' ')
      end

      def test_order
        # Options are grouped by kind in a fixed CLI order, regardless of
        # the order in which they were declared.
        cmd = CallJava.new_java_call do
          @jar = 'hadoop-pie.jar'
          @main_class = 'com.java.package.DoItLive'
          env 'e1', 'dudio'
          env 'e2', 'dudi-rio'
          file 'phile', 'f'
          conf 'c1', 'conf'
          conf 'c2', 'cronf'
          arg 'seriously'
          archive 'dungeon', 'd'
          file 'phile2', 'f2'
        end
        assert_equal "hadoop jar hadoop-pie.jar com.java.package.DoItLive -Dc1=conf -Dc2=cronf -cmdenv e1=dudio -cmdenv e2=dudi-rio -cacheFile phile#f -cacheFile phile2#f2 -cacheArchive dungeon#d seriously", cmd.to_hadoop_cli.join(' ')
      end

      def test_validation
        # A jar is mandatory for both serializations.
        assert_raises RuntimeError do
          CallJava.new_java_call do
            @main_class = 'com.java.package.DoItLive'
          end.to_hadoop_cli
        end

        assert_raises RuntimeError do
          CallJava.new_java_call do
            @main_class = 'com.java.package.DoItLive'
          end.to_h
        end
      end

      def test_hash
        cmd = CallJava.new_java_call do
          @jar = 'hadoop-pie.jar'
          @main_class = 'com.java.package.DoItLive'
          env 'e1', 'dudio'
          env 'e2', 'dudi-rio'
          file 'phile', 'f'
          conf 'c1', 'conf'
          conf 'c2', 'cronf'
          arg 'seriously'
          archive 'dungeon', 'd'
          file 'phile', 'f'
          @poopsy = 'poo' # unknown ivar: must not leak into the hash
        end
        expected = { jar: "hadoop-pie.jar", archives: ["dungeon#d"],
                     main_class: "com.java.package.DoItLive", envs: { e1: "dudio", e2: "dudi-rio" },
                     files: ["phile#f", "phile#f"], confs: { c1: "conf", c2: "cronf" }, args: ["seriously"] }
        assert_equal expected, cmd.to_h
      end

    end
  end
end
# data/test/rubadoop/map_reduce/call_streaming_test.rb
require 'test_helper'

module Rubadoop
  module MapReduce
    # Covers CallStreaming: CLI rendering, hash serialization and the
    # validations, including the jar requirement inherited from CallJava.
    class CallStreamingTest < MiniTest::Spec

      def test_simple
        call = CallStreaming.new_streaming_call do |c|
          c.jar = 'hadoop-streaming.jar'
          c.input = 'input'
          c.output = 'output'
          c.mapper = '/bin/cat'
          c.reducer = '/bin/wc'
        end
        assert_equal "hadoop jar hadoop-streaming.jar -input input -output output -mapper /bin/cat -reducer /bin/wc", call.to_hadoop_cli.join(' ')
      end

      def test_hash
        call = CallStreaming.new_streaming_call do |c|
          c.jar = 'hadoop-streaming.jar'
          c.inputformat = 'green'
          c.input = 'input'
          c.outputformat = 'blue'
          c.output = 'output'
          c.mapper = '/bin/cat'
          c.reducer = '/bin/wc'
          # NOTE(review): with a block parameter this ivar lands on the test
          # instance, not the call object, so it cannot show up in to_h.
          @test = 'noshow'
        end
        expected = { jar: "hadoop-streaming.jar", main_class: nil, envs: nil, args: nil, confs: nil, files: nil, archives: nil,
                     mapper: "/bin/cat", reducer: "/bin/wc", inputformat: "green", input: "input", outputformat: "blue", output: "output" }
        assert_equal expected, call.to_h
      end

      def test_cli
        call = CallStreaming.new_streaming_call do |c|
          c.jar = 'hadoop-streaming.jar'
          c.inputformat = 'green'
          c.input = 'input'
          c.outputformat = 'blue'
          c.output = 'output'
          c.mapper = '/bin/cat'
          c.reducer = '/bin/wc'
          @test = 'noshow'
        end
        assert_equal ["hadoop", "jar", "hadoop-streaming.jar", "-inputformat", "green", "-input", "input", "-outputformat", "blue",
                      "-output", "output", "-mapper", "/bin/cat", "-reducer", "/bin/wc"], call.to_hadoop_cli
      end

      def test_validation
        # Missing mapper.
        assert_raises RuntimeError do
          CallStreaming.new_streaming_call do |c|
            c.jar = 'hadoop-streaming.jar'
            c.input = 'input'
            c.output = 'output'
            c.reducer = '/bin/wc'
          end.to_hadoop_cli
        end

        # Missing output, instance_eval form.
        assert_raises RuntimeError do
          CallStreaming.new_streaming_call do
            @jar = 'hadoop-streaming.jar'
            @input = 'input'
            @mapper = '/bin/cat'
            @reducer = '/bin/wc'
          end.to_h
        end
      end

      def test_validation_inherited
        # The jar requirement from CallJava still applies here.
        assert_raises RuntimeError do
          CallStreaming.new_streaming_call do |c|
            c.input = 'input'
            c.output = 'output'
            c.mapper = '/bin/cat'
            c.reducer = '/bin/wc'
          end.to_hadoop_cli
        end
      end
    end
  end
end