rubadoop 0.7.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +7 -0
- data/Gemfile.lock +43 -0
- data/README.rdoc +7 -0
- data/Rakefile +25 -0
- data/lib/rubadoop.rb +26 -0
- data/lib/rubadoop/base_dsl.rb +31 -0
- data/lib/rubadoop/emr.rb +23 -0
- data/lib/rubadoop/emr/jobflow_builder.rb +36 -0
- data/lib/rubadoop/emr/jobflow_builder/bootstrap_action.rb +27 -0
- data/lib/rubadoop/emr/jobflow_builder/job_spec.rb +77 -0
- data/lib/rubadoop/emr/jobflow_builder/step.rb +41 -0
- data/lib/rubadoop/map_reduce.rb +23 -0
- data/lib/rubadoop/map_reduce/call_java.rb +112 -0
- data/lib/rubadoop/map_reduce/call_streaming.rb +55 -0
- data/lib/rubadoop/map_reduce/identity.rb +30 -0
- data/lib/rubadoop/map_reduce/io.rb +128 -0
- data/lib/rubadoop/map_reduce/job_conf_environment.rb +9 -0
- data/lib/rubadoop/map_reduce/mappable.rb +59 -0
- data/lib/rubadoop/map_reduce/mapper.rb +15 -0
- data/lib/rubadoop/map_reduce/reducable.rb +74 -0
- data/lib/rubadoop/map_reduce/reducer.rb +12 -0
- data/lib/rubadoop/map_reduce/test_assist.rb +65 -0
- data/lib/rubadoop/map_reduce/utils.rb +29 -0
- data/lib/rubadoop/oozie/workflow_builder.rb +42 -0
- data/lib/rubadoop/oozie/workflow_builder/job_properties.rb +19 -0
- data/lib/rubadoop/version.rb +3 -0
- data/test/rubadoop/base_dsl_test.rb +27 -0
- data/test/rubadoop/emr/jobflow_builder_test.rb +184 -0
- data/test/rubadoop/map_reduce/call_java_test.rb +122 -0
- data/test/rubadoop/map_reduce/call_streaming_test.rb +81 -0
- data/test/rubadoop/map_reduce/identity_test.rb +40 -0
- data/test/rubadoop/map_reduce/io_test.rb +51 -0
- data/test/rubadoop/map_reduce/job_conf_environment_test.rb +28 -0
- data/test/rubadoop/map_reduce/mappable_test.rb +62 -0
- data/test/rubadoop/map_reduce/mapper_test.rb +76 -0
- data/test/rubadoop/map_reduce/reducable_test.rb +12 -0
- data/test/rubadoop/map_reduce/reducer_test.rb +137 -0
- data/test/rubadoop/map_reduce/test_assist_test.rb +76 -0
- data/test/rubadoop/oozie/workflow_builder_test.rb +21 -0
- data/test/test_helper.rb +10 -0
- metadata +140 -0
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
module Rubadoop
  module MapReduce
    # A CallJava variant that carries the settings of a streaming job
    # (mapper, reducer, combiner, input/output locations and formats) and
    # renders them as command-line switches.
    class CallStreaming < CallJava
      attr_accessor :mapper, :reducer, :input, :output, :inputformat, :outputformat, :combiner

      class << self
        # Builds a CallStreaming from +params+ and optionally configures it
        # with a block.
        #
        # A block that declares exactly one parameter receives the new
        # object as its argument; any other block is instance_eval'd against
        # the new object. Returns the configured object.
        def new_streaming_call(params = {}, &block)
          call = CallStreaming.new(params)
          return call unless block_given?

          block.arity == 1 ? yield(call) : call.instance_eval(&block)
          call
        end
      end

      # Forwards +params+ unchanged to the parent initializer.
      def initialize(params = {})
        super(params)
      end

      # Returns the parent's command (requested with +skip_args: true+)
      # followed by one "-switch value" pair for every streaming setting
      # that is set, and finally the contents of @args when present.
      def to_hadoop_cli(opts = {})
        cmd = super(opts.merge(skip_args: true))
        switches = [
          ['-inputformat', @inputformat],
          ['-input', @input],
          ['-outputformat', @outputformat],
          ['-output', @output],
          ['-mapper', @mapper],
          ['-reducer', @reducer],
          ['-combiner', @combiner]
        ]
        switches.each do |flag, setting|
          cmd.concat [flag, "#{setting}"] if setting
        end
        cmd.concat @args if @args
        cmd
      end

      # Returns the parent's hash extended with every non-nil streaming
      # setting, keyed by the setting name as a symbol.
      def to_h
        summary = super
        %i[mapper reducer combiner inputformat input outputformat output].each do |field|
          setting = instance_variable_get("@#{field}")
          summary[field] = setting unless setting.nil?
        end
        summary
      end

      # Runs the parent's validation, then raises a RuntimeError naming the
      # first of mapper, reducer, input and output that is still nil.
      def validate
        super
        %i[@mapper @reducer @input @output].each do |required|
          raise "Missing #{required}" if instance_variable_get(required).nil?
        end
      end

    end
  end
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
module Rubadoop
  module MapReduce
    # Pass-through mapper and reducer. Both do their work from inside the
    # constructor, so creating an instance processes the input.
    module Identity

      # Emits every input line unchanged.
      class Mapper < MapReduce::Mapper
        def initialize(config = {})
          super(config)

          mapper { |line| MapReduce.out.entry(line) }
        end
      end

      # Emits one entry per value of each key: "key<TAB>value", or just the
      # key when the value is nil.
      class Reducer < MapReduce::Reducer
        def initialize(config = {})
          super(config)

          reducer do |key, values|
            values.each do |value|
              MapReduce.out.entry(value.nil? ? key : "#{key}\t#{value}")
            end
          end
        end
      end

    end
  end
end
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
module Rubadoop
  module MapReduce
    # Mixin that stores the input/output/error streams and the reporter
    # object ("out") as instance variables of whatever object it is mixed
    # into. The reporter classes below read the streams through
    # MapReduce.io_out / MapReduce.io_err.
    module Io
      attr_writer :io_in, :io_out, :io_err
      attr_writer :out

      # Current reporter; a HadoopOut is created on first use when none was
      # assigned.
      def out
        @out ||= HadoopOut.new
      end

      # Each stream falls back to the matching process-wide standard stream
      # when no replacement has been assigned.
      def io_in; @io_in || STDIN end
      def io_out; @io_out || STDOUT end
      def io_err; @io_err || STDERR end

      class << self
        # Replaces the reporter with one that drops entries and statuses.
        def set_silent_output
          ::Rubadoop::MapReduce.out = EmptyOut.new
        end

        # Replaces the reporter with one that prints labelled lines.
        def set_standard_output
          ::Rubadoop::MapReduce.out = StandardOut.new
        end

        # Replaces the reporter with one that writes to Rails.logger.
        def set_rails_output
          ::Rubadoop::MapReduce.out = RailsOut.new
        end
      end

    end

    # Base class for the reporters that keep counter totals in memory.
    #
    # The original file wrapped this class in bare `private` / `public`
    # markers; those only change the visibility of methods defined after
    # them, never of constants, so they had no effect and were removed.
    class CollectCounters
      # Nested hash of totals: group => { counter => accumulated amount }.
      def counters
        @counters ||= {}
      end

      # Adds +amount+ to the running total of +counter+ within +group+,
      # creating the group and the counter on first use.
      def counter(group, counter, amount)
        c_group = counters[group] ||= {}
        if c_group[counter]
          c_group[counter] += amount
        else
          c_group[counter] = amount
        end
      end
    end

    # Reporter that writes "reporter:" prefixed counter and status lines to
    # the error stream and raw entries to the output stream.
    class HadoopOut
      def counter(group, counter, amount)
        MapReduce.io_err.puts "reporter:counter:#{group},#{counter},#{amount}"
      end

      def status(status)
        MapReduce.io_err.puts "reporter:status:#{status}"
      end

      def map_entry(key, value)
        MapReduce.io_out.puts "#{key}\t#{value}"
      end

      def entry(value)
        MapReduce.io_out.puts value
      end

      def error(message)
        MapReduce.io_err.puts message
      end
    end

    # Reporter that prints labelled lines to the output stream and also
    # accumulates counter totals.
    class StandardOut < CollectCounters
      def counter(group, counter, amount)
        super
        MapReduce.io_out.puts "Counter: #{group}.#{counter} +#{amount}"
      end

      def status(status)
        MapReduce.io_out.puts "reporter:status:#{status}"
      end

      def map_entry(key, value)
        MapReduce.io_out.puts "MapEntry: #{key}\t#{value}"
      end

      def entry(value)
        MapReduce.io_out.puts "Entry: #{value}"
      end

      def error(message)
        MapReduce.io_err.puts message
      end
    end

    # Reporter that sends everything to Rails.logger and also accumulates
    # counter totals.
    class RailsOut < CollectCounters
      def counter(group, counter, amount)
        super
        Rails.logger.debug "Counter: #{group}.#{counter} +#{amount}"
      end

      def status(status)
        Rails.logger.info "Status: #{status}"
      end

      def map_entry(key, value)
        Rails.logger.debug "MapEntry: #{key}\t#{value}"
      end

      def entry(value)
        Rails.logger.debug "Entry: #{value}"
      end

      def error(message)
        Rails.logger.error "Error: #{message}"
      end
    end

    # Reporter that discards statuses and entries; counters are still
    # accumulated and errors still go to the error stream.
    class EmptyOut < CollectCounters
      def status(status); end
      def map_entry(key, value); end
      def entry(value); end

      def error(message)
        MapReduce.io_err.puts message
      end
    end

    # Reporter that captures entries (and statuses) in memory so they can be
    # inspected afterwards.
    class TestOut < CollectCounters
      # Captured entries, in emission order.
      def entries
        @entries ||= []
      end

      # Captured status messages, in emission order.
      def statuses
        @statuses ||= []
      end

      # Every other reporter responds to #status; without this method, code
      # that reports a status raised NoMethodError whenever TestOut was the
      # active reporter.
      def status(status)
        statuses << status
      end

      def map_entry(key, value)
        entries << "#{key}\t#{value}"
      end

      def entry(value)
        entries << value
      end

      def error(message)
        MapReduce.io_err.puts message
      end
    end

  end
end
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
module Rubadoop
  module MapReduce
    # Mapper-side input handling; including classes also get
    # Rubadoop::MapReduce::Utils mixed in.
    module Mappable
      extend ActiveSupport::Concern

      included do
        include Rubadoop::MapReduce::Utils
      end

      # Reads MapReduce.io_in line by line, strips the trailing newline and
      # yields each line to the block.
      #
      # When @input_ignore_key is set, each line is first passed through
      # key_value_split and only the value part is kept; if the value part
      # is nil but a key exists, the key is used instead.
      #
      # Without a block (intended for tests only) the lines are collected
      # and returned as an array; with a block the return value is nil.
      def mapper
        collected = []

        MapReduce.io_in.each_line do |line|
          line.chomp!
          if @input_ignore_key
            key, line = key_value_split(line)
            line = key unless key.nil? || !line.nil?
          end

          block_given? ? (yield line) : collected << line
        end

        block_given? ? nil : collected
      end

      # Same as #mapper, but hands lines over in arrays of up to
      # +batch_size+ elements; a final, smaller batch carries the leftover
      # lines. Without a block the batches are returned as an array of
      # arrays; with a block the return value is nil.
      def mapper_batched(batch_size = 50)
        finished = []
        pending = []

        mapper do |line|
          pending << line
          next if pending.size < batch_size

          block_given? ? (yield pending) : finished << pending
          pending = []
        end

        # Flush the leftover lines that did not fill a whole batch.
        unless pending.empty?
          block_given? ? (yield pending) : finished << pending
        end

        block_given? ? nil : finished
      end

    end
  end
end
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
module Rubadoop
  module MapReduce
    # Concrete mapper built on Mappable; #process is another name for
    # #mapper.
    class Mapper
      include Mappable

      # * *Params* :
      #   - +input_ignore_key+ -> Ignore Key Parameter. Required for any Hadoop InputFormat other than TextInputFormat
      def initialize(config = {})
        @input_ignore_key = config[:input_ignore_key]
      end

      alias_method :process, :mapper
    end
  end
end
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
module Rubadoop
  module MapReduce
    # Reducer-side input handling; including classes also get
    # Rubadoop::MapReduce::Utils mixed in.
    module Reducable
      extend ActiveSupport::Concern

      included do
        include Rubadoop::MapReduce::Utils
      end

      # Groups consecutive "key<TAB>value" lines from MapReduce.io_in by key.
      #
      # With a block, calls it once per run of equal keys as
      # (key, values_enumerator) and returns nil. The enumerator pulls lines
      # from the input only while it is being iterated, so the values of a
      # key are not all held in memory at once.
      #
      # Without a block (intended for tests only), returns an array of
      # { key:, values: } hashes with the values fully materialised; an
      # empty input gives an empty array.
      #
      # NOTE(review): the enumerator reads from the shared input stream and
      # shares `line`, `key`, `value` and `last_key` with this method, so it
      # looks safe to iterate each values enumerator only once - confirm
      # before relying on repeated iteration.
      def reducer(&block)
        lines = []
        enum = nil

        line = MapReduce.io_in.gets
        if line.nil?
          return block_given? ? nil : lines
        end

        line.chomp!
        key, value = key_value_split(line)
        last_key = key

        loop do
          # The caller's block may not have consumed the previous group;
          # drain it so the input is positioned at the next key.
          if enum && !enum.flushed
            enum.each { |_| }
            break if line.nil?
          end

          enum = TrackingEnumerator.new do |yielder|
            # `value` currently holds the first value of this group.
            yielder << value
            loop do
              line = MapReduce.io_in.gets
              if line.nil?
                enum.flushed = true
                break
              end
              line.chomp!

              key, value = key_value_split(line)
              if key != last_key
                # First line of the next group: stop here and keep its
                # key/value in the shared locals for the next enumerator.
                enum.flushed = true
                break
              end
              yielder << value
            end
            last_key = key
          end

          enum.flushed = false
          if block_given?
            block.call(last_key, enum)
          else
            # The enumerator must be run right away here, before the next
            # loop pass replaces the shared locals.
            lines << { key: last_key, values: enum.to_a }
          end
          break if line.nil?
        end

        block_given? ? nil : lines
      end

    end

    # A bare `private` changes the visibility of methods defined after it;
    # it does not hide the constant defined below.
    private

    # Enumerator with a flag recording whether its group has been read to
    # the end.
    class TrackingEnumerator < Enumerator
      attr_accessor :flushed
    end

  end
end
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
module Rubadoop
  module MapReduce
    # Helpers for running mapper/reducer code against in-memory input while
    # capturing what it emits in a TestOut.
    module TestAssist

      # Runs the block with MapReduce.io_in replaced by +input+ and
      # MapReduce.out replaced by a fresh TestOut, which is returned.
      #
      # +input+ may be nil (empty input), a String, an Array (one line per
      # element), a Hash (key => value or key => [values], written as
      # "key<TAB>value" lines), or any other object, which is used as the
      # input stream as-is.
      def run_test_mapper(input)
        run_with_test_io(prepare_input(input)) { yield }
      end

      # Same contract as #run_test_mapper.
      def run_test_reducer(input)
        run_with_test_io(prepare_input(input)) { yield }
      end

      private

      # Converts the supported input shapes into an object usable as
      # MapReduce.io_in.
      #
      # nil and String inputs are wrapped in a StringIO: a bare String
      # supports #each_line but has no public #gets, which the reducer
      # relies on.
      def prepare_input(input)
        case input
        when nil
          StringIO.new
        when String
          StringIO.new(input)
        when Array
          buffer = StringIO.new
          input.each { |item| buffer.puts item }
          buffer.rewind
          buffer
        when Hash
          buffer = StringIO.new
          input.each do |key, value|
            if value.is_a? Array
              value.each { |entry| buffer << key << "\t" << entry << "\n" }
            else
              buffer << key << "\t" << value << "\n"
            end
          end
          buffer.rewind
          buffer
        else
          input
        end
      end

      # Swaps in +input_io+ and a fresh TestOut, yields, and puts the
      # previous reporter and streams back afterwards. The restore runs in
      # an `ensure`, so a block that raises no longer leaves the test
      # reporter and fake input installed for later tests.
      #
      # Returns the TestOut that was active during the block.
      def run_with_test_io(input_io)
        map_reduce = ::Rubadoop::MapReduce
        saved_out = map_reduce.out
        saved_in = map_reduce.io_in
        saved_io_out = map_reduce.io_out
        saved_io_err = map_reduce.io_err

        test_io = TestOut.new
        begin
          map_reduce.io_in = input_io if input_io
          map_reduce.out = test_io
          yield
        ensure
          map_reduce.out = saved_out
          map_reduce.io_in = saved_in
          map_reduce.io_out = saved_io_out
          map_reduce.io_err = saved_io_err
        end
        test_io
      end
    end
  end
end
|