rubadoop 0.7.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42):
  1. checksums.yaml +7 -0
  2. data/Gemfile +7 -0
  3. data/Gemfile.lock +43 -0
  4. data/README.rdoc +7 -0
  5. data/Rakefile +25 -0
  6. data/lib/rubadoop.rb +26 -0
  7. data/lib/rubadoop/base_dsl.rb +31 -0
  8. data/lib/rubadoop/emr.rb +23 -0
  9. data/lib/rubadoop/emr/jobflow_builder.rb +36 -0
  10. data/lib/rubadoop/emr/jobflow_builder/bootstrap_action.rb +27 -0
  11. data/lib/rubadoop/emr/jobflow_builder/job_spec.rb +77 -0
  12. data/lib/rubadoop/emr/jobflow_builder/step.rb +41 -0
  13. data/lib/rubadoop/map_reduce.rb +23 -0
  14. data/lib/rubadoop/map_reduce/call_java.rb +112 -0
  15. data/lib/rubadoop/map_reduce/call_streaming.rb +55 -0
  16. data/lib/rubadoop/map_reduce/identity.rb +30 -0
  17. data/lib/rubadoop/map_reduce/io.rb +128 -0
  18. data/lib/rubadoop/map_reduce/job_conf_environment.rb +9 -0
  19. data/lib/rubadoop/map_reduce/mappable.rb +59 -0
  20. data/lib/rubadoop/map_reduce/mapper.rb +15 -0
  21. data/lib/rubadoop/map_reduce/reducable.rb +74 -0
  22. data/lib/rubadoop/map_reduce/reducer.rb +12 -0
  23. data/lib/rubadoop/map_reduce/test_assist.rb +65 -0
  24. data/lib/rubadoop/map_reduce/utils.rb +29 -0
  25. data/lib/rubadoop/oozie/workflow_builder.rb +42 -0
  26. data/lib/rubadoop/oozie/workflow_builder/job_properties.rb +19 -0
  27. data/lib/rubadoop/version.rb +3 -0
  28. data/test/rubadoop/base_dsl_test.rb +27 -0
  29. data/test/rubadoop/emr/jobflow_builder_test.rb +184 -0
  30. data/test/rubadoop/map_reduce/call_java_test.rb +122 -0
  31. data/test/rubadoop/map_reduce/call_streaming_test.rb +81 -0
  32. data/test/rubadoop/map_reduce/identity_test.rb +40 -0
  33. data/test/rubadoop/map_reduce/io_test.rb +51 -0
  34. data/test/rubadoop/map_reduce/job_conf_environment_test.rb +28 -0
  35. data/test/rubadoop/map_reduce/mappable_test.rb +62 -0
  36. data/test/rubadoop/map_reduce/mapper_test.rb +76 -0
  37. data/test/rubadoop/map_reduce/reducable_test.rb +12 -0
  38. data/test/rubadoop/map_reduce/reducer_test.rb +137 -0
  39. data/test/rubadoop/map_reduce/test_assist_test.rb +76 -0
  40. data/test/rubadoop/oozie/workflow_builder_test.rb +21 -0
  41. data/test/test_helper.rb +10 -0
  42. metadata +140 -0
@@ -0,0 +1,40 @@
1
+ require 'test_helper'
2
+
3
module Rubadoop
  module MapReduce
    # Checks the pass-through behaviour of Identity::Mapper and
    # Identity::Reducer. Every test only constructs the object under test and
    # then compares the contents of the StringIO installed as MapReduce.io_out.
    class IdentityTest < MiniTest::Spec
      # The input's last line has no newline; the expected output terminates
      # every line with one.
      def test_mapper
        sink = redirect_io("abcd\nefgh")
        Identity::Mapper.new
        assert_equal "abcd\nefgh\n", sink.string
      end

      # With input_ignore_key the text before the tab is expected to be dropped.
      def test_mapper_ignorekey
        sink = redirect_io("1\tabcd\n2\tefgh")
        Identity::Mapper.new(input_ignore_key: true)
        assert_equal "abcd\nefgh\n", sink.string
      end

      # Tab-separated key/value lines are expected to come out unchanged.
      def test_reducer
        sink = redirect_io("k1\tv1\nk2\tv2\n")
        Identity::Reducer.new
        assert_equal "k1\tv1\nk2\tv2\n", sink.string
      end

      # Lines without a tab are expected to come out unchanged as well.
      def test_reducer_dummy
        sink = redirect_io("abcd\nefgh")
        Identity::Reducer.new
        assert_equal "abcd\nefgh\n", sink.string
      end

      private

      # Installs +text+ as MapReduce.io_in and a fresh StringIO as
      # MapReduce.io_out; returns that StringIO so the caller can inspect it.
      def redirect_io(text)
        MapReduce.io_in = StringIO.new(text)
        sink = StringIO.new
        MapReduce.io_out = sink
        sink
      end
    end
  end
end
@@ -0,0 +1,51 @@
1
+ require 'test_helper'
2
+
3
module Rubadoop
  module MapReduce
    # Drives the Utils output helpers (out_entry, out_map_entry, log_counter)
    # against the different objects that can be installed as MapReduce.out.
    class IoTest < MiniTest::Spec
      include ::Rubadoop::MapReduce::Utils

      # Smoke test: no assertion, only checks that emitting does not raise.
      def test_silent_output
        ::Rubadoop::MapReduce::Io.set_silent_output

        out_entry('silent')
      end

      # Smoke test: no assertion, only checks that emitting does not raise.
      def test_standard_output
        ::Rubadoop::MapReduce::Io.set_standard_output

        out_entry('standard')
      end

      # TestOut is expected to accumulate counters and keep emitted entries.
      def test_test_out
        ::Rubadoop::MapReduce.out = TestOut.new

        emit_sample_output

        assert_sample_counters
        assert_equal ['poop', "k1\tv1"], ::Rubadoop::MapReduce.out.entries
      end

      # The same counter totals are expected from StandardOut and EmptyOut.
      def test_counter_collection
        [StandardOut, EmptyOut].each do |backend|
          ::Rubadoop::MapReduce.out = backend.new

          emit_sample_output

          assert_sample_counters
        end
      end

      private

      # Emits one plain entry, three counter updates in the 'Block' group
      # ('Rock' +1, 'Stock' +0, 'Rock' +5) and one key/value entry.
      def emit_sample_output
        out_entry('poop')
        [['Rock', 1], ['Stock', 0], ['Rock', 5]].each do |counter, amount|
          log_counter 'Block', counter, amount
        end
        out_map_entry('k1', 'v1')
      end

      # Verifies the totals produced by emit_sample_output on the current out.
      def assert_sample_counters
        assert_equal 6, ::Rubadoop::MapReduce.out.counters['Block']['Rock']
        assert_equal 0, ::Rubadoop::MapReduce.out.counters['Block']['Stock']
      end
    end
  end
end
@@ -0,0 +1,28 @@
1
+ require 'test_helper'
2
+
3
module Rubadoop
  module MapReduce
    # Tests JobConfEnvironment#job_conf_environment against values placed in
    # the process environment.
    class JobConfEnvironmentTest < MiniTest::Spec
      include ::Rubadoop::MapReduce::JobConfEnvironment

      # A dotted name is expected to be answered from the ENV entry whose name
      # uses underscores in place of the dots.
      def test_simple
        ENV['map_input_file'] = 'goop'

        assert_equal 'goop', job_conf_environment('map.input.file')
      ensure
        # Undo the ENV change even when the assertion fails, so a failure here
        # cannot leak into test_missing or any other test.
        ENV.delete 'map_input_file'
      end

      # An ENV entry stored under the dotted name itself is expected to be
      # ignored, so the lookup yields nil.
      def test_missing
        ENV['map.input.file'] = 'poop'

        # assert_nil rather than assert_equal nil: the latter is deprecated in
        # Minitest 5 and fails in Minitest 6.
        assert_nil job_conf_environment('map.input.file')
      ensure
        ENV.delete 'map.input.file'
      end
    end
  end
end
@@ -0,0 +1,62 @@
1
+ require 'test_helper'
2
+
3
module Rubadoop
  module MapReduce
    # Tests Mappable#mapper and Mappable#mapper_batched through the
    # TestAssist#run_test_mapper harness. The inputs are integers; the
    # assertions compare against strings.
    class MappableTest < MiniTest::Spec
      include TestAssist
      include Mappable

      # Only every hundredth input line is emitted.
      def test_mapper
        input = (1..500).to_a
        result = run_test_mapper(input) do
          mapper do |line|
            out_entry line if (line.to_i % 100).zero?
          end
        end
        assert_result_entries %w[100 200 300 400 500], result
      end

      # 501 lines in batches of 100: five full batches plus a final batch of one.
      def test_mapper_batched
        input = (1..501).to_a
        result = run_test_mapper(input) do
          mapper_batched(100) do |batch|
            out_entry batch
          end
        end
        expected = (1..501).each_slice(100).map { |slice| slice.map(&:to_s) }
        assert_result_entries expected, result
      end

      # Input size equal to the batch size: exactly one batch, no empty trailer.
      def test_mapper_batched_edge
        input = (1..10).to_a
        result = run_test_mapper(input) do
          mapper_batched(10) do |batch|
            out_entry batch
          end
        end
        assert_result_entries [(1..10).map(&:to_s)], result
      end

      # No input at all: nothing is emitted.
      def test_mapper_batched_empty
        result = run_test_mapper(nil) do
          mapper_batched(10) do |batch|
            out_entry batch
          end
        end
        assert_result_entries [], result
      end

      private

      # Asserts that result.entries has exactly the elements of +expected+, in
      # order. The expected value is computed before the call so no assertion
      # uses the ambiguous `assert_equal (expr).method, ...` form.
      def assert_result_entries(expected, result)
        assert_equal expected.size, result.entries.size
        expected.each_with_index do |entry, index|
          assert_equal entry, result.entries[index]
        end
      end
    end
  end
end
62
+
@@ -0,0 +1,76 @@
1
+ require 'test_helper'
2
+
3
module Rubadoop
  module MapReduce
    # Empty subclass, so the tests exercise only what Mapper itself provides.
    class TestMapper < MapReduce::Mapper
    end

    # Tests how Mapper#process splits MapReduce.io_in into lines, both when
    # given a block and when returning the collected lines.
    class MapperTest < MiniTest::Spec
      # Block form: each line is yielded once, in order.
      def test_split_block
        mapper = TestMapper.new

        MapReduce.io_in = StringIO.new("abcd\nefgh")
        seen = 0
        mapper.process do |line|
          case seen
          when 0 then assert_equal 'abcd', line
          when 1 then assert_equal 'efgh', line
          # Report the offending line. The message used to interpolate `key`,
          # which is not defined in this block and would have raised NameError.
          else fail "unexpected line: #{seen}=#{line}"
          end
          seen += 1
        end
        assert_equal 2, seen
      end

      # Return-value form across trailing newlines, blank lines, whitespace,
      # tabs and empty input.
      def test_split_various
        assert_processed TestMapper.new, [
          ["abcd\nefgh", %w[abcd efgh]],
          ["abcd\n\n\n", ['abcd', '', '']],
          ["0\n1",       %w[0 1]],
          ["0\n1\n",     %w[0 1]],
          ["0\n1\n ",    ['0', '1', ' ']],
          ["0\t1\n",     ["0\t1"]],
          ['',           []]
        ]
      end

      # With input_ignore_key the text up to the tab is dropped; lines without
      # a tab are returned whole.
      def test_mapper_ignorekey
        assert_processed TestMapper.new(input_ignore_key: true), [
          ["0\t1\n2\t3",   %w[1 3]],
          ["0\n1\n2\n3",   %w[0 1 2 3]],
          ["\t1\n2\t3\n",  %w[1 3]]
        ]
      end

      # #mapper and #process are expected to give the same result.
      def test_mapper_function_name
        mapper = TestMapper.new

        MapReduce.io_in = StringIO.new("0\n1\n2")
        assert_equal %w[0 1 2], mapper.mapper

        MapReduce.io_in = StringIO.new("0\n1\n2")
        assert_equal %w[0 1 2], mapper.process
      end

      private

      # Feeds each input text to +mapper+ in turn and compares the value
      # returned by #process with the expected lines. The input is included in
      # the failure message so the failing case can be identified.
      def assert_processed(mapper, cases)
        cases.each do |text, expected|
          MapReduce.io_in = StringIO.new(text)
          assert_equal expected, mapper.process, "input: #{text.inspect}"
        end
      end
    end
  end
end
@@ -0,0 +1,12 @@
1
+ require 'test_helper'
2
+
3
+ module Rubadoop
4
+ module MapReduce
5
+ class ReducableTest < MiniTest::Spec
6
+
7
+ # TODO: this spec class has no test methods yet; add them here.
8
+
9
+ end
10
+ end
11
+ end
12
+
@@ -0,0 +1,137 @@
1
+ require 'test_helper'
2
+
3
module Rubadoop
  # Empty subclass, so the tests exercise only what Reducer itself provides.
  class TestReducer < MapReduce::Reducer
  end

  # Tests how Reducer#process groups consecutive tab-separated lines by key,
  # in block form and in return-value form.
  class ReducerTest < MiniTest::Spec
    def test_splitting
      reducer = TestReducer.new

      # Block form: every group's values are read in full.
      each_group(reducer, "1\t1\n1\t2\n2\t2", %w[1 2]) do |position, values|
        case position
        when 0 then assert_equal %w[1 2], values.to_a
        when 1 then assert_equal %w[2], values.to_a
        end
      end

      # Return-value form: an array of { key:, values: } entries.
      assert_grouping reducer, "1\t1\n1\t2\n2\t2", [['1', %w[1 2]], ['2', %w[2]]]
      assert_grouping reducer, "1\t1", [['1', %w[1]]]
      assert_grouping reducer, '', []
      # Lines without a tab: the whole line is the key and the value is nil.
      assert_grouping reducer, "abcd\nefgh", [['abcd', [nil]], ['efgh', [nil]]]
      assert_grouping reducer, "1\t1\n1\t2\n2\t2", [['1', %w[1 2]], ['2', %w[2]]]
      assert_grouping reducer, seven_records,
                      [['1', %w[1 2]], ['2', %w[2]], ['3', %w[3 1 9]], ['4', %w[1]]]
    end

    # Each pass leaves the values of some groups unread or only partly read;
    # all four keys must still arrive, in order.
    def test_skipping
      reducer = TestReducer.new
      keys = %w[1 2 3 4]

      # Group "3" is not read at all.
      each_group(reducer, seven_records, keys) do |position, values|
        case position
        when 0 then assert_equal %w[1 2], values.to_a
        when 1 then assert_equal %w[2], values.to_a
        when 3 then assert_equal %w[1], values.to_a
        end
      end

      # Only the first value of group "3" is read.
      each_group(reducer, seven_records, keys) do |position, values|
        case position
        when 0 then assert_equal %w[1 2], values.to_a
        when 1 then assert_equal %w[2], values.to_a
        when 2 then assert_equal '3', values.next
        when 3 then assert_equal %w[1], values.to_a
        end
      end

      # Groups "1" and "3" are not read.
      each_group(reducer, seven_records, keys) do |position, values|
        case position
        when 1 then assert_equal %w[2], values.to_a
        when 3 then assert_equal %w[1], values.to_a
        end
      end

      # Groups "3" and "4" are not read.
      each_group(reducer, seven_records, keys) do |position, values|
        case position
        when 0 then assert_equal %w[1 2], values.to_a
        when 1 then assert_equal %w[2], values.to_a
        end
      end

      # No group is read.
      each_group(reducer, seven_records, keys)
    end

    private

    # Seven records under the keys 1, 2, 3 and 4; a fresh String on every call.
    def seven_records
      "1\t1\n1\t2\n2\t2\n3\t3\n3\t1\n3\t9\n4\t1"
    end

    # Runs reducer.process in block form over +text+. For every group it
    # asserts the key against +expected_keys+ and then hands the group's
    # position and values to the optional block; it touches the values in no
    # other way. Finally asserts that exactly expected_keys.size groups came.
    def each_group(reducer, text, expected_keys, &inspect_values)
      MapReduce.io_in = StringIO.new(text)
      position = 0
      reducer.process do |key, values|
        fail "unexpected line: #{position}=#{key}" if position >= expected_keys.size

        assert_equal expected_keys[position], key
        inspect_values.call(position, values) if inspect_values
        position += 1
      end
      assert_equal expected_keys.size, position
    end

    # Runs reducer.process without a block over +text+ and compares the result
    # with +expected+, a list of [key, values] pairs.
    def assert_grouping(reducer, text, expected)
      MapReduce.io_in = StringIO.new(text)
      groups = reducer.process
      assert_equal expected.size, groups.size
      expected.each_with_index do |(key, values), index|
        assert_equal key, groups[index][:key]
        assert_equal values, groups[index][:values]
      end
    end
  end
end
137
+
@@ -0,0 +1,76 @@
1
+ require 'test_helper'
2
+
3
module Rubadoop
  module MapReduce
    # Tests the TestAssist harness (run_test_mapper / run_test_reducer) using
    # the two small sample jobs defined below.
    class TestAssistTest < MiniTest::Spec
      include ::Rubadoop::MapReduce::TestAssist

      # Eight distinct words; "the" appears twice (once capitalised). The
      # positions asserted here follow the order in which words first occur.
      def test_wordcount_mapper
        lines = ['The quick brown fox', 'jumped over', 'the lazy dog']
        result = run_test_mapper(lines) { SimpleWordCount.new.execute }

        assert_equal 8, result.entries.size
        assert_equal "the\t2", result.entries[0]
        assert_equal "dog\t1", result.entries[7]
      end

      # The reducer input is given as a hash of key => list of values.
      def test_summing_reducer
        grouped = { 'the' => [3, 4, 5], 'fox' => [5] }
        result = run_test_reducer(grouped) { SummingReducer.new.execute }

        assert_equal 2, result.entries.size
        assert_equal "the\t12", result.entries[0]
        assert_equal "fox\t5", result.entries[1]
      end

      # With nil input, an entry written directly to MapReduce.out is still
      # captured in the result.
      def test_no_input
        result = run_test_mapper(nil) { ::Rubadoop::MapReduce.out.entry('cookoo') }

        assert_equal 1, result.entries.size
        assert_equal 'cookoo', result.entries[0]
      end
    end

    # Sample mapper: counts case-insensitive word occurrences over all input
    # lines, then emits one word/count pair per distinct word.
    class SimpleWordCount
      include ::Rubadoop::MapReduce::Utils
      include ::Rubadoop::MapReduce::Mappable

      def execute
        tally = {}

        mapper do |line|
          line.split(/\s+/).each do |token|
            word = token.downcase
            tally[word] = tally.fetch(word, 0) + 1
          end
        end

        # Emitted only after the whole input has been read.
        tally.each { |word, count| out_map_entry(word, count) }
      end
    end

    # Sample reducer: emits, per key, the sum of its values converted with to_i.
    class SummingReducer
      include ::Rubadoop::MapReduce::Utils
      include ::Rubadoop::MapReduce::Reducable

      def execute
        reducer do |key, counts|
          total = 0
          counts.each { |count| total += count.to_i }
          out_map_entry(key, total)
        end
      end
    end
  end
end