rubadoop 0.7.8

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42)
  1. checksums.yaml +7 -0
  2. data/Gemfile +7 -0
  3. data/Gemfile.lock +43 -0
  4. data/README.rdoc +7 -0
  5. data/Rakefile +25 -0
  6. data/lib/rubadoop.rb +26 -0
  7. data/lib/rubadoop/base_dsl.rb +31 -0
  8. data/lib/rubadoop/emr.rb +23 -0
  9. data/lib/rubadoop/emr/jobflow_builder.rb +36 -0
  10. data/lib/rubadoop/emr/jobflow_builder/bootstrap_action.rb +27 -0
  11. data/lib/rubadoop/emr/jobflow_builder/job_spec.rb +77 -0
  12. data/lib/rubadoop/emr/jobflow_builder/step.rb +41 -0
  13. data/lib/rubadoop/map_reduce.rb +23 -0
  14. data/lib/rubadoop/map_reduce/call_java.rb +112 -0
  15. data/lib/rubadoop/map_reduce/call_streaming.rb +55 -0
  16. data/lib/rubadoop/map_reduce/identity.rb +30 -0
  17. data/lib/rubadoop/map_reduce/io.rb +128 -0
  18. data/lib/rubadoop/map_reduce/job_conf_environment.rb +9 -0
  19. data/lib/rubadoop/map_reduce/mappable.rb +59 -0
  20. data/lib/rubadoop/map_reduce/mapper.rb +15 -0
  21. data/lib/rubadoop/map_reduce/reducable.rb +74 -0
  22. data/lib/rubadoop/map_reduce/reducer.rb +12 -0
  23. data/lib/rubadoop/map_reduce/test_assist.rb +65 -0
  24. data/lib/rubadoop/map_reduce/utils.rb +29 -0
  25. data/lib/rubadoop/oozie/workflow_builder.rb +42 -0
  26. data/lib/rubadoop/oozie/workflow_builder/job_properties.rb +19 -0
  27. data/lib/rubadoop/version.rb +3 -0
  28. data/test/rubadoop/base_dsl_test.rb +27 -0
  29. data/test/rubadoop/emr/jobflow_builder_test.rb +184 -0
  30. data/test/rubadoop/map_reduce/call_java_test.rb +122 -0
  31. data/test/rubadoop/map_reduce/call_streaming_test.rb +81 -0
  32. data/test/rubadoop/map_reduce/identity_test.rb +40 -0
  33. data/test/rubadoop/map_reduce/io_test.rb +51 -0
  34. data/test/rubadoop/map_reduce/job_conf_environment_test.rb +28 -0
  35. data/test/rubadoop/map_reduce/mappable_test.rb +62 -0
  36. data/test/rubadoop/map_reduce/mapper_test.rb +76 -0
  37. data/test/rubadoop/map_reduce/reducable_test.rb +12 -0
  38. data/test/rubadoop/map_reduce/reducer_test.rb +137 -0
  39. data/test/rubadoop/map_reduce/test_assist_test.rb +76 -0
  40. data/test/rubadoop/oozie/workflow_builder_test.rb +21 -0
  41. data/test/test_helper.rb +10 -0
  42. metadata +140 -0
@@ -0,0 +1,40 @@
require 'test_helper'

module Rubadoop
  module MapReduce

    # Checks what Identity::Mapper and Identity::Reducer write to
    # MapReduce.io_out for a given MapReduce.io_in. The tests only construct
    # the object and then inspect the captured output.
    class IdentityTest < MiniTest::Spec
      def test_mapper
        written = capture_output("abcd\nefgh") { Identity::Mapper.new }
        assert_equal "abcd\nefgh\n", written
      end

      def test_mapper_ignorekey
        written = capture_output("1\tabcd\n2\tefgh") do
          Identity::Mapper.new(input_ignore_key: true)
        end
        assert_equal "abcd\nefgh\n", written
      end

      def test_reducer
        written = capture_output("k1\tv1\nk2\tv2\n") { Identity::Reducer.new }
        assert_equal "k1\tv1\nk2\tv2\n", written
      end

      def test_reducer_dummy
        written = capture_output("abcd\nefgh") { Identity::Reducer.new }
        assert_equal "abcd\nefgh\n", written
      end

      private

      # Points MapReduce.io_in at +input+ and MapReduce.io_out at a fresh
      # StringIO, runs the block, and returns everything written to the sink.
      def capture_output(input)
        MapReduce.io_in = StringIO.new(input)
        sink = StringIO.new
        MapReduce.io_out = sink
        yield
        sink.string
      end
    end
  end
end
@@ -0,0 +1,51 @@
require 'test_helper'

module Rubadoop
  module MapReduce
    # Exercises the Utils output helpers (out_entry, out_map_entry,
    # log_counter) against the different MapReduce.out implementations.
    class IoTest < MiniTest::Spec
      include ::Rubadoop::MapReduce::Utils

      # No assertion: the test passes as long as emitting does not raise.
      def test_silent_output
        ::Rubadoop::MapReduce::Io.set_silent_output

        out_entry('silent')
      end

      # No assertion: the test passes as long as emitting does not raise.
      def test_standard_output
        ::Rubadoop::MapReduce::Io.set_standard_output

        out_entry('standard')
      end

      def test_test_out
        ::Rubadoop::MapReduce.out = TestOut.new

        emit_fixture_records

        collector = ::Rubadoop::MapReduce.out
        assert_equal 6, collector.counters['Block']['Rock']
        assert_equal 0, collector.counters['Block']['Stock']
        assert_equal ['poop', "k1\tv1"], collector.entries
      end

      def test_counter_collection
        [StandardOut, EmptyOut].each do |output_class|
          ::Rubadoop::MapReduce.out = output_class.new

          emit_fixture_records

          collector = ::Rubadoop::MapReduce.out
          assert_equal 6, collector.counters['Block']['Rock']
          assert_equal 0, collector.counters['Block']['Stock']
        end
      end

      private

      # Emits the fixed sequence shared by the collecting tests: one plain
      # entry, three counter updates (Rock 1 + 5, Stock 0) and one key/value
      # entry.
      def emit_fixture_records
        out_entry('poop')
        log_counter 'Block', 'Rock', 1
        log_counter 'Block', 'Stock', 0
        log_counter 'Block', 'Rock', 5
        out_map_entry('k1', 'v1')
      end
    end
  end
end
@@ -0,0 +1,28 @@
require 'test_helper'

module Rubadoop
  module MapReduce
    # Tests JobConfEnvironment#job_conf_environment lookups against ENV.
    #
    # Each test sets a process-wide environment variable, so the cleanup
    # lives in an +ensure+ clause: a failing assertion must not leak the
    # variable into other tests.
    class JobConfEnvironmentTest < MiniTest::Spec
      include ::Rubadoop::MapReduce::JobConfEnvironment

      # The dotted name 'map.input.file' is expected to be answered from the
      # underscore-separated variable 'map_input_file'.
      def test_simple
        ENV['map_input_file'] = 'goop'

        assert_equal 'goop', job_conf_environment('map.input.file')
      ensure
        ENV.delete 'map_input_file'
      end

      # A variable literally named 'map.input.file' must not satisfy the
      # lookup; the result is expected to be nil.
      def test_missing
        ENV['map.input.file'] = 'poop'

        # assert_nil rather than assert_equal nil, which Minitest deprecates.
        assert_nil job_conf_environment('map.input.file')
      ensure
        ENV.delete 'map.input.file'
      end
    end
  end
end
@@ -0,0 +1,62 @@
require 'test_helper'

module Rubadoop
  module MapReduce
    # Tests the Mappable helpers (mapper, mapper_batched) through
    # TestAssist#run_test_mapper.
    class MappableTest < MiniTest::Spec
      include TestAssist
      include Mappable

      # Only every hundredth line is emitted; entries come back as strings.
      def test_mapper
        result = run_test_mapper((1..500).to_a) do
          mapper { |line| out_entry line if (line.to_i % 100).zero? }
        end

        expected = %w[100 200 300 400 500]
        assert_equal 5, result.entries.size
        expected.each_with_index do |value, index|
          assert_equal value, result.entries[index]
        end
      end

      # 501 lines in batches of 100: five full batches plus one of size one.
      def test_mapper_batched
        result = run_test_mapper((1..501).to_a) do
          mapper_batched(100) { |batch| out_entry batch }
        end

        expected_batches = (1..501).map(&:to_s).each_slice(100).to_a
        assert_equal 6, result.entries.size
        expected_batches.each_with_index do |batch, index|
          assert_equal(batch, result.entries[index])
        end
      end

      # Input size equal to the batch size yields exactly one batch.
      def test_mapper_batched_edge
        result = run_test_mapper((1..10).to_a) do
          mapper_batched(10) { |batch| out_entry batch }
        end

        assert_equal 1, result.entries.size
        assert_equal((1..10).map(&:to_s), result.entries[0])
      end

      # With nil input nothing is emitted.
      def test_mapper_batched_empty
        result = run_test_mapper(nil) do
          mapper_batched(10) { |batch| out_entry batch }
        end

        assert_equal 0, result.entries.size
      end
    end
  end
end
@@ -0,0 +1,76 @@
require 'test_helper'

module Rubadoop
  module MapReduce

    # Mapper subclass with no overrides, used as the subject of the tests.
    class TestMapper < MapReduce::Mapper
    end

    # Tests how Mapper#process (and its #mapper counterpart) turn
    # MapReduce.io_in into lines, with and without input_ignore_key.
    class MapperTest < MiniTest::Spec

      # With a block, process is expected to yield each line in input order.
      def test_split_block
        mapper = TestMapper.new

        MapReduce.io_in = StringIO.new("abcd\nefgh")
        l = 0
        mapper.process do |line|
          case l
          when 0 then assert_equal "abcd", line
          when 1 then assert_equal "efgh", line
          else
            # Interpolate the block's own parameter; there is no `key` here,
            # so referencing it would raise NameError and hide this message.
            fail "unexpected line: #{l}=#{line}"
          end
          l += 1
        end
        assert_equal 2, l
      end

      # Without a block, process is expected to return the lines as an array.
      def test_split_various
        mapper = TestMapper.new

        MapReduce.io_in = StringIO.new("abcd\nefgh")
        assert_equal ["abcd", "efgh"], mapper.process

        MapReduce.io_in = StringIO.new("abcd\n\n\n")
        assert_equal ["abcd", "", ""], mapper.process

        MapReduce.io_in = StringIO.new("0\n1")
        assert_equal ["0", "1"], mapper.process

        MapReduce.io_in = StringIO.new("0\n1\n")
        assert_equal ["0", "1"], mapper.process

        MapReduce.io_in = StringIO.new("0\n1\n ")
        assert_equal ["0", "1", " "], mapper.process

        MapReduce.io_in = StringIO.new("0\t1\n")
        assert_equal ["0\t1"], mapper.process

        MapReduce.io_in = StringIO.new("")
        assert_equal [], mapper.process
      end

      # With input_ignore_key, everything up to the tab is dropped; lines
      # without a tab are expected to come through unchanged.
      def test_mapper_ignorekey
        mapper = TestMapper.new(input_ignore_key: true)

        MapReduce.io_in = StringIO.new("0\t1\n2\t3")
        assert_equal ["1", "3"], mapper.process

        MapReduce.io_in = StringIO.new("0\n1\n2\n3")
        assert_equal ["0", "1", "2", "3"], mapper.process

        MapReduce.io_in = StringIO.new("\t1\n2\t3\n")
        assert_equal ["1", "3"], mapper.process
      end

      # #mapper and #process are expected to return the same result.
      def test_mapper_function_name
        mapper = TestMapper.new

        MapReduce.io_in = StringIO.new("0\n1\n2")
        assert_equal ["0", "1", "2"], mapper.mapper

        MapReduce.io_in = StringIO.new("0\n1\n2")
        assert_equal ["0", "1", "2"], mapper.process
      end
    end
  end
end
@@ -0,0 +1,12 @@
require 'test_helper'

module Rubadoop
  module MapReduce
    # Placeholder spec for Reducable; it defines no test cases yet.
    class ReducableTest < MiniTest::Spec
      # TODO
    end
  end
end
@@ -0,0 +1,137 @@
require 'test_helper'

module Rubadoop

  # Reducer subclass with no overrides, used as the subject of the tests.
  class TestReducer < MapReduce::Reducer
  end

  # Tests how Reducer#process groups tab-separated key/value lines read
  # from MapReduce.io_in, both in block form and in return-value form.
  class ReducerTest < MiniTest::Spec

    def test_splitting
      reducer = TestReducer.new

      # Block form: one call per key, values exposed as an enumerable.
      MapReduce.io_in = StringIO.new("1\t1\n1\t2\n2\t2")
      calls = 0
      reducer.process do |key, values|
        case calls
        when 0
          assert_equal '1', key
          assert_equal %w[1 2], values.to_a
        when 1
          assert_equal '2', key
          assert_equal %w[2], values.to_a
        else
          fail "unexpected line: #{calls}=#{key}"
        end
        calls += 1
      end
      assert_equal 2, calls

      # Return-value form: an array of { key:, values: } groups.
      assert_groups([['1', %w[1 2]], ['2', %w[2]]],
                    collect(reducer, "1\t1\n1\t2\n2\t2"))

      assert_groups([['1', %w[1]]], collect(reducer, "1\t1"))

      assert_groups([], collect(reducer, ""))

      # Lines without a tab become keys whose only value is nil.
      assert_groups([['abcd', [nil]], ['efgh', [nil]]],
                    collect(reducer, "abcd\nefgh"))

      assert_groups([['1', %w[1 2]], ['2', %w[2]]],
                    collect(reducer, "1\t1\n1\t2\n2\t2"))

      assert_groups([['1', %w[1 2]], ['2', %w[2]], ['3', %w[3 1 9]], ['4', %w[1]]],
                    collect(reducer, seven_row_input))
    end

    # Each pass reads the same seven-row input but consumes a different
    # subset of the per-key values; every pass must still see all four keys.
    def test_skipping
      reducer = TestReducer.new
      keys = %w[1 2 3 4]

      # Values of key "3" are left unread.
      walk_groups(reducer) do |position, key, values|
        assert_equal keys[position], key
        case position
        when 0 then assert_equal %w[1 2], values.to_a
        when 1 then assert_equal %w[2], values.to_a
        when 3 then assert_equal %w[1], values.to_a
        end
      end

      # Only the first value of key "3" is read.
      walk_groups(reducer) do |position, key, values|
        assert_equal keys[position], key
        case position
        when 0 then assert_equal %w[1 2], values.to_a
        when 1 then assert_equal %w[2], values.to_a
        when 2 then assert_equal '3', values.next
        when 3 then assert_equal %w[1], values.to_a
        end
      end

      # Values of keys "1" and "3" are left unread.
      walk_groups(reducer) do |position, key, values|
        assert_equal keys[position], key
        case position
        when 1 then assert_equal %w[2], values.to_a
        when 3 then assert_equal %w[1], values.to_a
        end
      end

      # Values of keys "3" and "4" are left unread.
      walk_groups(reducer) do |position, key, values|
        assert_equal keys[position], key
        case position
        when 0 then assert_equal %w[1 2], values.to_a
        when 1 then assert_equal %w[2], values.to_a
        end
      end

      # No values are read at all.
      walk_groups(reducer) do |position, key, _values|
        assert_equal keys[position], key
      end
    end

    private

    # Seven rows covering keys "1" through "4"; a fresh string on every call.
    def seven_row_input
      "1\t1\n1\t2\n2\t2\n3\t3\n3\t1\n3\t9\n4\t1"
    end

    # Feeds +input+ to +reducer+ and returns the result of the block-less
    # process call.
    def collect(reducer, input)
      MapReduce.io_in = StringIO.new(input)
      reducer.process
    end

    # Asserts that +groups+ has exactly the [key, values] pairs in
    # +expected+, checking the size first and then each group in order.
    def assert_groups(expected, groups)
      assert_equal expected.size, groups.size
      expected.each_with_index do |(key, values), index|
        assert_equal key, groups[index][:key]
        assert_equal values, groups[index][:values]
      end
    end

    # Runs +reducer+ over the seven-row input, yielding the zero-based group
    # position together with the key and values, and asserts that exactly
    # four groups were seen.
    def walk_groups(reducer)
      MapReduce.io_in = StringIO.new(seven_row_input)
      position = 0
      reducer.process do |key, values|
        fail "unexpected line: #{position}=#{key}" if position > 3
        yield position, key, values
        position += 1
      end
      assert_equal 4, position
    end
  end
end
@@ -0,0 +1,76 @@
require 'test_helper'

module Rubadoop
  module MapReduce
    # Tests the TestAssist helpers (run_test_mapper, run_test_reducer) using
    # the two small sample jobs defined further down.
    class TestAssistTest < MiniTest::Spec
      include ::Rubadoop::MapReduce::TestAssist

      def test_wordcount_mapper
        lines = ['The quick brown fox', 'jumped over', 'the lazy dog']
        result = run_test_mapper(lines) { SimpleWordCount.new.execute }

        assert_equal 8, result.entries.size
        assert_equal "the\t2", result.entries[0]
        assert_equal "dog\t1", result.entries[7]
      end

      def test_summing_reducer
        grouped = { 'the' => [3, 4, 5], 'fox' => [5] }
        result = run_test_reducer(grouped) { SummingReducer.new.execute }

        assert_equal 2, result.entries.size
        assert_equal "the\t12", result.entries[0]
        assert_equal "fox\t5", result.entries[1]
      end

      # Output written directly inside the block is still captured when the
      # input is nil.
      def test_no_input
        result = run_test_mapper(nil) do
          ::Rubadoop::MapReduce.out.entry('cookoo')
        end

        assert_equal 1, result.entries.size
        assert_equal "cookoo", result.entries[0]
      end

    end

    # Sample mapper: counts case-insensitive word occurrences across all
    # input lines and emits one "word<TAB>count" entry per distinct word.
    class SimpleWordCount
      include ::Rubadoop::MapReduce::Utils
      include ::Rubadoop::MapReduce::Mappable

      def execute
        tally = Hash.new(0)

        mapper do |line|
          line.split(/\s+/).each { |token| tally[token.downcase] += 1 }
        end

        tally.each { |word, count| out_map_entry(word, count) }
      end
    end

    # Sample reducer: emits each key with the integer sum of its values.
    class SummingReducer
      include ::Rubadoop::MapReduce::Utils
      include ::Rubadoop::MapReduce::Reducable

      def execute
        reducer do |key, counts|
          total = 0
          counts.each { |count| total += count.to_i }
          out_map_entry(key, total)
        end
      end
    end

  end
end