rubadoop 0.7.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Gemfile +7 -0
- data/Gemfile.lock +43 -0
- data/README.rdoc +7 -0
- data/Rakefile +25 -0
- data/lib/rubadoop.rb +26 -0
- data/lib/rubadoop/base_dsl.rb +31 -0
- data/lib/rubadoop/emr.rb +23 -0
- data/lib/rubadoop/emr/jobflow_builder.rb +36 -0
- data/lib/rubadoop/emr/jobflow_builder/bootstrap_action.rb +27 -0
- data/lib/rubadoop/emr/jobflow_builder/job_spec.rb +77 -0
- data/lib/rubadoop/emr/jobflow_builder/step.rb +41 -0
- data/lib/rubadoop/map_reduce.rb +23 -0
- data/lib/rubadoop/map_reduce/call_java.rb +112 -0
- data/lib/rubadoop/map_reduce/call_streaming.rb +55 -0
- data/lib/rubadoop/map_reduce/identity.rb +30 -0
- data/lib/rubadoop/map_reduce/io.rb +128 -0
- data/lib/rubadoop/map_reduce/job_conf_environment.rb +9 -0
- data/lib/rubadoop/map_reduce/mappable.rb +59 -0
- data/lib/rubadoop/map_reduce/mapper.rb +15 -0
- data/lib/rubadoop/map_reduce/reducable.rb +74 -0
- data/lib/rubadoop/map_reduce/reducer.rb +12 -0
- data/lib/rubadoop/map_reduce/test_assist.rb +65 -0
- data/lib/rubadoop/map_reduce/utils.rb +29 -0
- data/lib/rubadoop/oozie/workflow_builder.rb +42 -0
- data/lib/rubadoop/oozie/workflow_builder/job_properties.rb +19 -0
- data/lib/rubadoop/version.rb +3 -0
- data/test/rubadoop/base_dsl_test.rb +27 -0
- data/test/rubadoop/emr/jobflow_builder_test.rb +184 -0
- data/test/rubadoop/map_reduce/call_java_test.rb +122 -0
- data/test/rubadoop/map_reduce/call_streaming_test.rb +81 -0
- data/test/rubadoop/map_reduce/identity_test.rb +40 -0
- data/test/rubadoop/map_reduce/io_test.rb +51 -0
- data/test/rubadoop/map_reduce/job_conf_environment_test.rb +28 -0
- data/test/rubadoop/map_reduce/mappable_test.rb +62 -0
- data/test/rubadoop/map_reduce/mapper_test.rb +76 -0
- data/test/rubadoop/map_reduce/reducable_test.rb +12 -0
- data/test/rubadoop/map_reduce/reducer_test.rb +137 -0
- data/test/rubadoop/map_reduce/test_assist_test.rb +76 -0
- data/test/rubadoop/oozie/workflow_builder_test.rb +21 -0
- data/test/test_helper.rb +10 -0
- metadata +140 -0
@@ -0,0 +1,55 @@
|
|
1
|
+
module Rubadoop
  module MapReduce
    # A CallJava specialisation that carries the options of a streaming job
    # (mapper, reducer, combiner, input/output locations and formats) and
    # renders them as command-line flags.
    class CallStreaming < CallJava
      attr_accessor :mapper, :reducer, :input, :output, :inputformat, :outputformat, :combiner

      # Creates a CallStreaming from +params+ and, when a block is supplied,
      # lets the block configure it.
      #
      # A block that declares exactly one parameter receives the new object as
      # its argument; any other block is evaluated with the new object as +self+.
      #
      # Returns the (possibly configured) CallStreaming.
      def self.new_streaming_call(params = {}, &block)
        call = CallStreaming.new(params)
        return call if block.nil?

        if block.arity == 1
          block.call(call)
        else
          call.instance_eval(&block)
        end
        call
      end

      # Delegates straight to the superclass constructor.
      def initialize(params = {})
        super(params)
      end

      # Renders the call as an array of command-line tokens.
      #
      # The superclass rendering is requested with +skip_args: true+ and @args
      # is appended here after the streaming flags.
      # NOTE(review): presumably skip_args stops CallJava from emitting @args
      # itself -- confirm against CallJava#to_hadoop_cli.
      def to_hadoop_cli(opts = {})
        cli = super(opts.merge(skip_args: true))

        # Flag order is significant for the generated command; keep it stable.
        streaming_flags = [
          ['-inputformat',  @inputformat],
          ['-input',        @input],
          ['-outputformat', @outputformat],
          ['-output',       @output],
          ['-mapper',       @mapper],
          ['-reducer',      @reducer],
          ['-combiner',     @combiner]
        ]
        streaming_flags.each do |flag, setting|
          cli.concat [flag, "#{setting}"] if setting
        end

        cli.concat @args if @args
        cli
      end

      # Extends the superclass hash with every streaming option that is not nil,
      # keyed by the option name as a symbol.
      def to_h
        hash = super
        [:mapper, :reducer, :combiner, :inputformat, :input, :outputformat, :output].each do |name|
          setting = instance_variable_get("@#{name}")
          hash[name] = setting unless setting.nil?
        end
        hash
      end

      # Runs the superclass validation, then raises a RuntimeError naming the
      # first of mapper, reducer, input and output that is nil.
      def validate
        super
        [:@mapper, :@reducer, :@input, :@output].each do |ivar|
          next unless instance_variable_get(ivar).nil?

          raise "Missing #{ivar}"
        end
      end
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module Rubadoop
  module MapReduce
    # Pass-through mapper and reducer: they re-emit what they read.
    module Identity
      # Emits every input line unchanged.
      #
      # Note that the work happens inside the constructor: +initialize+ calls
      # +mapper+ with a block, so building an instance consumes the input.
      class Mapper < MapReduce::Mapper
        def initialize(config = {})
          super(config)

          mapper { |record| MapReduce.out.entry(record) }
        end
      end

      # Emits one entry per value handed to the +reducer+ block: "key<TAB>value"
      # when the value is present, or the bare key when the value is nil.
      #
      # As with the mapper, the processing is triggered from +initialize+.
      class Reducer < MapReduce::Reducer
        def initialize(config = {})
          super(config)

          reducer do |key, values|
            values.each do |value|
              if value.nil?
                MapReduce.out.entry key
              else
                MapReduce.out.entry "#{key}\t#{value}"
              end
            end
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,128 @@
|
|
1
|
+
module Rubadoop
  module MapReduce
    # Accessors for the input/output/error streams and for the reporter object
    # ("out") that mappers and reducers write through.
    #
    # The reporters below call these as MapReduce.io_out, MapReduce.io_err, etc.
    # NOTE(review): that implies MapReduce extends this module somewhere outside
    # this file -- confirm.
    module Io
      attr_writer :io_in, :io_out, :io_err
      attr_writer :out

      # The active reporter; a HadoopOut is created on first use when none was set.
      def out
        @out ||= HadoopOut.new
      end

      # Each stream falls back to the process-wide standard stream when unset.
      def io_in; @io_in || STDIN end
      def io_out; @io_out || STDOUT end
      def io_err; @io_err || STDERR end

      class << self
        # Replaces the reporter with one that discards entries and statuses.
        def set_silent_output
          ::Rubadoop::MapReduce.out = EmptyOut.new
        end

        # Replaces the reporter with one that prints labelled lines to io_out.
        def set_standard_output
          ::Rubadoop::MapReduce.out = StandardOut.new
        end

        # Replaces the reporter with one that writes to Rails.logger.
        def set_rails_output
          ::Rubadoop::MapReduce.out = RailsOut.new
        end
      end
    end

    # Base class for reporters that keep counter totals in memory.
    # (The original file wrapped this class in bare `private`/`public` keywords;
    # those do not affect class constants, so they are omitted.)
    class CollectCounters
      # Nested hash of totals: counters[group][counter] => accumulated amount.
      def counters
        @counters ||= {}
      end

      # Adds +amount+ to the named counter, creating the group and the counter
      # on first use. Returns the counter's new value.
      def counter(group, counter, amount)
        group_counts = (counters[group] ||= {})
        if group_counts[counter]
          group_counts[counter] += amount
        else
          group_counts[counter] = amount
        end
      end
    end

    # Default reporter: entries go to io_out, counters/status/errors go to
    # io_err using the "reporter:" line prefixes.
    class HadoopOut
      def counter(group, counter, amount)
        MapReduce.io_err.puts "reporter:counter:#{group},#{counter},#{amount}"
      end

      def status(status)
        MapReduce.io_err.puts "reporter:status:#{status}"
      end

      def map_entry(key, value)
        MapReduce.io_out.puts "#{key}\t#{value}"
      end

      def entry(value)
        MapReduce.io_out.puts value
      end

      def error(message)
        MapReduce.io_err.puts message
      end
    end

    # Reporter that prints labelled lines to io_out (errors to io_err) while
    # also accumulating counters in memory.
    class StandardOut < CollectCounters
      def counter(group, counter, amount)
        super
        MapReduce.io_out.puts "Counter: #{group}.#{counter} +#{amount}"
      end

      def status(status)
        MapReduce.io_out.puts "reporter:status:#{status}"
      end

      def map_entry(key, value)
        MapReduce.io_out.puts "MapEntry: #{key}\t#{value}"
      end

      def entry(value)
        MapReduce.io_out.puts "Entry: #{value}"
      end

      def error(message)
        MapReduce.io_err.puts message
      end
    end

    # Reporter that routes everything to Rails.logger while also accumulating
    # counters in memory. Requires Rails to be loaded by the host application.
    class RailsOut < CollectCounters
      def counter(group, counter, amount)
        super
        Rails.logger.debug "Counter: #{group}.#{counter} +#{amount}"
      end

      def status(status)
        Rails.logger.info "Status: #{status}"
      end

      def map_entry(key, value)
        Rails.logger.debug "MapEntry: #{key}\t#{value}"
      end

      def entry(value)
        Rails.logger.debug "Entry: #{value}"
      end

      def error(message)
        Rails.logger.error "Error: #{message}"
      end
    end

    # Reporter that drops statuses and entries; counters are still accumulated
    # and errors are still written to io_err.
    class EmptyOut < CollectCounters
      def status(status); end
      def map_entry(key, value); end
      def entry(value); end

      def error(message)
        MapReduce.io_err.puts message
      end
    end

    # Reporter for tests: entries, statuses and counters are collected in memory
    # so assertions can inspect them; errors are written to io_err.
    class TestOut < CollectCounters
      # Emitted entries, in order. map_entry stores "key<TAB>value" strings.
      def entries
        @entries ||= []
      end

      # Status messages reported so far, in order.
      def statuses
        @statuses ||= []
      end

      # Every other reporter responds to #status; without this method a job
      # that reports status would raise NoMethodError only when run under test.
      def status(status)
        statuses << status
      end

      def map_entry(key, value)
        entries << "#{key}\t#{value}"
      end

      def entry(value)
        entries << value
      end

      def error(message)
        MapReduce.io_err.puts message
      end
    end
  end
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
module Rubadoop
  module MapReduce
    # Mixin providing line-oriented input processing for mappers.
    module Mappable
      extend ActiveSupport::Concern

      included do
        include Rubadoop::MapReduce::Utils
      end

      # Reads MapReduce.io_in line by line, strips the trailing newline and
      # yields each line to the block.
      #
      # When @input_ignore_key is set, each line is first passed through
      # key_value_split and only the value part is kept; if the split yields no
      # value but does yield a key, the key is used instead.
      #
      # Without a block the processed lines are collected and returned as an
      # array (intended for tests). With a block the return value is nil.
      def mapper
        streaming = block_given?
        collected = []

        MapReduce.io_in.each_line do |record|
          record.chomp!
          if @input_ignore_key
            key, record = key_value_split(record)
            record = key if record.nil? && !key.nil?
          end
          streaming ? yield(record) : collected.push(record)
        end

        collected unless streaming
      end

      # Same as #mapper, but hands lines over in arrays of up to +batch_size+
      # lines. A final, shorter batch is delivered for any remainder.
      #
      # Without a block the batches are collected and returned as an array of
      # arrays. With a block the return value is nil.
      def mapper_batched(batch_size = 50)
        streaming = block_given?
        pending = []
        collected = []

        mapper do |record|
          pending << record
          next if pending.size < batch_size

          streaming ? yield(pending) : collected.push(pending)
          # Start a fresh array so the batch just handed over is not mutated.
          pending = []
        end

        # Deliver whatever is left over after the input is exhausted.
        unless pending.empty?
          streaming ? yield(pending) : collected.push(pending)
        end

        collected unless streaming
      end
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Rubadoop
  module MapReduce
    # Concrete mapper built on the Mappable mixin.
    class Mapper
      include Mappable

      # config - options hash. Recognised key:
      #   :input_ignore_key - when truthy, the key part of each input line is
      #                       dropped before the line is processed. Required
      #                       for any Hadoop InputFormat other than
      #                       TextInputFormat.
      def initialize(config = {})
        @input_ignore_key = config[:input_ignore_key]
      end

      # #process is another name for #mapper.
      alias_method :process, :mapper
    end
  end
end
|
@@ -0,0 +1,74 @@
|
|
1
|
+
module Rubadoop
  module MapReduce
    # Mixin providing key-grouped input processing for reducers.
    module Reducable
      extend ActiveSupport::Concern

      included do
        include Rubadoop::MapReduce::Utils
      end

      # Reads "key<TAB>value" lines from MapReduce.io_in and pivots consecutive
      # lines that share a key into one (key, values) group.
      #
      # With a block, the block is called once per group with the key and a
      # lazy enumerator over that group's values; lines are only read from the
      # input as the enumerator is consumed, which keeps memory use low.
      # Returns nil.
      #
      # Without a block (testing only) every group is materialised and an array
      # of {key:, values:} hashes is returned.
      def reducer(&block)
        grouped = []
        current = nil

        line = MapReduce.io_in.gets
        if line.nil?
          # Empty input: nothing to group.
          return block_given? ? nil : grouped
        end
        line.chomp!
        key, value = key_value_split(line)
        group_key = key

        loop do
          # If the consumer did not read the previous group to its end, drain
          # it here so the input is positioned at the start of the next group.
          if current && !current.flushed
            current.each { |_value| }
            break if line.nil?
          end

          # The generator shares line/key/value/group_key with this method:
          # when it stops, those locals describe the first line of the NEXT
          # group (or line is nil at end of input).
          current = TrackingEnumerator.new do |yielder|
            yielder << value
            loop do
              line = MapReduce.io_in.gets
              unless line.nil?
                line.chomp!
                key, value = key_value_split(line)
              end
              if line.nil? || key != group_key
                current.flushed = true
                break
              end
              yielder << value
            end
            group_key = key
          end

          current.flushed = false
          if block_given?
            block.call(group_key, current)
          else
            # to_a must run right away: it advances the shared input state.
            grouped << { key: group_key, values: current.to_a }
          end
          break if line.nil?
        end

        grouped unless block_given?
      end
    end

    # Enumerator that also records whether its group was read to the end.
    class TrackingEnumerator < Enumerator
      attr_accessor :flushed
    end
  end
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
module Rubadoop
  module MapReduce
    # Helpers for exercising mappers and reducers in tests without touching the
    # real standard streams.
    module TestAssist
      # Runs the block with +input+ installed as MapReduce.io_in and a fresh
      # TestOut installed as MapReduce.out.
      #
      # input - nil, a String, an Array of lines, a Hash of key => value(s),
      #         or any IO-like object (see #prepare_input).
      #
      # Returns the TestOut that captured the block's output.
      def run_test_mapper(input)
        run_with_test_io(prepare_input(input)) { yield }
      end

      # Same contract as #run_test_mapper; kept as a separate entry point for
      # reducer tests.
      def run_test_reducer(input)
        run_with_test_io(prepare_input(input)) { yield }
      end

      private

      # Converts the supported input shapes into an object to read from.
      #
      #   nil    -> an empty StringIO
      #   String -> a StringIO over that string
      #   Array  -> a StringIO with one line per element
      #   Hash   -> a StringIO with "key<TAB>value" lines; an Array value
      #             produces one line per element
      #   other  -> returned unchanged (expected to be IO-like already)
      #
      # nil and String are wrapped so that both each_line (mapper) and gets
      # (reducer) work; a bare String has no public gets.
      def prepare_input(input)
        case input
        when nil
          StringIO.new
        when String
          StringIO.new(input)
        when Array
          io = StringIO.new
          input.each { |item| io.puts item }
          io.rewind
          io
        when Hash
          io = StringIO.new
          input.each do |key, value|
            if value.is_a?(Array)
              value.each { |entry| io << key << "\t" << entry << "\n" }
            else
              io << key << "\t" << value << "\n"
            end
          end
          io.rewind
          io
        else
          input
        end
      end

      # Swaps in the test reporter and input stream, yields, and then restores
      # the previous reporter and streams.
      #
      # The restore runs in an ensure clause so that a block which raises
      # (for example a failing assertion) does not leave the test reporter and
      # input installed for later tests.
      #
      # Returns the TestOut used during the block.
      def run_with_test_io(input_io)
        saved_out = ::Rubadoop::MapReduce.out
        saved_in  = ::Rubadoop::MapReduce.io_in
        saved_io_out = ::Rubadoop::MapReduce.io_out
        saved_io_err = ::Rubadoop::MapReduce.io_err

        test_io = TestOut.new
        begin
          ::Rubadoop::MapReduce.io_in = input_io if input_io
          ::Rubadoop::MapReduce.out = test_io
          yield
        ensure
          ::Rubadoop::MapReduce.out = saved_out
          ::Rubadoop::MapReduce.io_in = saved_in
          ::Rubadoop::MapReduce.io_out = saved_io_out
          ::Rubadoop::MapReduce.io_err = saved_io_err
        end
        test_io
      end
    end
  end
end
|