rubadoop 0.7.8

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +7 -0
  3. data/Gemfile.lock +43 -0
  4. data/README.rdoc +7 -0
  5. data/Rakefile +25 -0
  6. data/lib/rubadoop.rb +26 -0
  7. data/lib/rubadoop/base_dsl.rb +31 -0
  8. data/lib/rubadoop/emr.rb +23 -0
  9. data/lib/rubadoop/emr/jobflow_builder.rb +36 -0
  10. data/lib/rubadoop/emr/jobflow_builder/bootstrap_action.rb +27 -0
  11. data/lib/rubadoop/emr/jobflow_builder/job_spec.rb +77 -0
  12. data/lib/rubadoop/emr/jobflow_builder/step.rb +41 -0
  13. data/lib/rubadoop/map_reduce.rb +23 -0
  14. data/lib/rubadoop/map_reduce/call_java.rb +112 -0
  15. data/lib/rubadoop/map_reduce/call_streaming.rb +55 -0
  16. data/lib/rubadoop/map_reduce/identity.rb +30 -0
  17. data/lib/rubadoop/map_reduce/io.rb +128 -0
  18. data/lib/rubadoop/map_reduce/job_conf_environment.rb +9 -0
  19. data/lib/rubadoop/map_reduce/mappable.rb +59 -0
  20. data/lib/rubadoop/map_reduce/mapper.rb +15 -0
  21. data/lib/rubadoop/map_reduce/reducable.rb +74 -0
  22. data/lib/rubadoop/map_reduce/reducer.rb +12 -0
  23. data/lib/rubadoop/map_reduce/test_assist.rb +65 -0
  24. data/lib/rubadoop/map_reduce/utils.rb +29 -0
  25. data/lib/rubadoop/oozie/workflow_builder.rb +42 -0
  26. data/lib/rubadoop/oozie/workflow_builder/job_properties.rb +19 -0
  27. data/lib/rubadoop/version.rb +3 -0
  28. data/test/rubadoop/base_dsl_test.rb +27 -0
  29. data/test/rubadoop/emr/jobflow_builder_test.rb +184 -0
  30. data/test/rubadoop/map_reduce/call_java_test.rb +122 -0
  31. data/test/rubadoop/map_reduce/call_streaming_test.rb +81 -0
  32. data/test/rubadoop/map_reduce/identity_test.rb +40 -0
  33. data/test/rubadoop/map_reduce/io_test.rb +51 -0
  34. data/test/rubadoop/map_reduce/job_conf_environment_test.rb +28 -0
  35. data/test/rubadoop/map_reduce/mappable_test.rb +62 -0
  36. data/test/rubadoop/map_reduce/mapper_test.rb +76 -0
  37. data/test/rubadoop/map_reduce/reducable_test.rb +12 -0
  38. data/test/rubadoop/map_reduce/reducer_test.rb +137 -0
  39. data/test/rubadoop/map_reduce/test_assist_test.rb +76 -0
  40. data/test/rubadoop/oozie/workflow_builder_test.rb +21 -0
  41. data/test/test_helper.rb +10 -0
  42. metadata +140 -0
@@ -0,0 +1,55 @@
module Rubadoop
  module MapReduce
    # Builder for a streaming call. Adds the streaming options (mapper,
    # reducer, combiner, input/output locations and formats) on top of
    # whatever CallJava provides.
    class CallStreaming < CallJava
      attr_accessor :mapper, :reducer, :input, :output, :inputformat, :outputformat, :combiner

      # Creates a CallStreaming from +params+ and optionally configures it.
      #
      # A block that takes exactly one argument receives the builder; any
      # other block is evaluated in the builder's own context.
      #
      # Returns the configured builder.
      def self.new_streaming_call(params = {}, &block)
        builder = CallStreaming.new(params)
        if block
          if block.arity == 1
            block.call(builder)
          else
            builder.instance_eval(&block)
          end
        end
        builder
      end

      def initialize(params = {})
        super(params)
      end

      # Builds the command array. The parent is asked to skip its own args so
      # that the streaming flags come first and @args is appended last.
      def to_hadoop_cli(opts = {})
        cmd = super(opts.merge(skip_args: true))
        flag_values = [
          ['-inputformat', @inputformat],
          ['-input', @input],
          ['-outputformat', @outputformat],
          ['-output', @output],
          ['-mapper', @mapper],
          ['-reducer', @reducer],
          ['-combiner', @combiner]
        ]
        flag_values.each do |flag, value|
          cmd.concat([flag, value.to_s]) if value
        end
        cmd.concat(@args) if @args
        cmd
      end

      # Extends the parent's hash with every streaming option that is not nil.
      def to_h
        built = super
        %i[mapper reducer combiner inputformat input outputformat output].each do |name|
          value = instance_variable_get("@#{name}")
          built[name] = value unless value.nil?
        end
        built
      end

      # Runs the parent's validation, then raises unless mapper, reducer,
      # input and output are all set.
      def validate
        super
        %i[@mapper @reducer @input @output].each do |property|
          raise "Missing #{property}" if instance_variable_get(property).nil?
        end
      end
    end
  end
end
@@ -0,0 +1,30 @@
module Rubadoop
  module MapReduce
    # Pass-through mapper and reducer: both do their work from the
    # constructor and write every record to MapReduce.out.
    module Identity
      # Emits each input line as an entry.
      class Mapper < MapReduce::Mapper
        def initialize(config = {})
          super(config)

          mapper { |line| MapReduce.out.entry(line) }
        end
      end

      # Emits "key<TAB>value" for each value, or just the key when the
      # value is nil.
      class Reducer < MapReduce::Reducer
        def initialize(config = {})
          super(config)

          reducer do |key, values|
            values.each do |value|
              if value.nil?
                MapReduce.out.entry key
              else
                MapReduce.out.entry "#{key}\t#{value}"
              end
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,128 @@
module Rubadoop
  module MapReduce
    # Accessors for the input/output/error streams and for the reporter
    # object ("out"). The classes below call these accessors on MapReduce
    # itself, so MapReduce is presumably extended with this module elsewhere.
    module Io
      attr_writer :io_in, :io_out, :io_err
      attr_writer :out

      # Current reporter; a HadoopOut is created on first use when none is set.
      def out
        @out ||= HadoopOut.new
      end

      # Each stream falls back to the process-wide standard stream.
      def io_in
        @io_in || STDIN
      end

      def io_out
        @io_out || STDOUT
      end

      def io_err
        @io_err || STDERR
      end

      class << self
        # Replace the MapReduce reporter with one that drops entries and status.
        def set_silent_output
          ::Rubadoop::MapReduce.out = EmptyOut.new
        end

        # Replace the MapReduce reporter with one that prints labelled lines.
        def set_standard_output
          ::Rubadoop::MapReduce.out = StandardOut.new
        end

        # Replace the MapReduce reporter with one that writes to Rails.logger.
        def set_rails_output
          ::Rubadoop::MapReduce.out = RailsOut.new
        end
      end
    end

    # Base reporter that accumulates counters as { group => { counter => total } }.
    # (A bare `private` does not hide class constants, so none is used here.)
    class CollectCounters
      def counters
        @counters ||= {}
      end

      # Adds +amount+ to the named counter, creating group and counter on
      # first use. Returns the counter's new value.
      def counter(group, counter, amount)
        bucket = counters[group] ||= {}
        current = bucket[counter]
        bucket[counter] = current ? current + amount : amount
      end
    end

    # Reporter for real runs: records go to io_out, while counters, status
    # and errors go to io_err ("reporter:" lines for counters and status).
    class HadoopOut
      def counter(group, counter, amount)
        MapReduce.io_err.puts "reporter:counter:#{group},#{counter},#{amount}"
      end

      def status(status)
        MapReduce.io_err.puts "reporter:status:#{status}"
      end

      def map_entry(key, value)
        MapReduce.io_out.puts "#{key}\t#{value}"
      end

      def entry(value)
        MapReduce.io_out.puts value
      end

      def error(message)
        MapReduce.io_err.puts message
      end
    end

    # Reporter that prints labelled lines to io_out and keeps counter totals.
    class StandardOut < CollectCounters
      def counter(group, counter, amount)
        super
        MapReduce.io_out.puts "Counter: #{group}.#{counter} +#{amount}"
      end

      # NOTE(review): this keeps the "reporter:status:" prefix although every
      # other line of this class uses a readable label -- confirm the intent.
      def status(status)
        MapReduce.io_out.puts "reporter:status:#{status}"
      end

      def map_entry(key, value)
        MapReduce.io_out.puts "MapEntry: #{key}\t#{value}"
      end

      def entry(value)
        MapReduce.io_out.puts "Entry: #{value}"
      end

      def error(message)
        MapReduce.io_err.puts message
      end
    end

    # Reporter that writes through Rails.logger and keeps counter totals.
    class RailsOut < CollectCounters
      def counter(group, counter, amount)
        super
        Rails.logger.debug "Counter: #{group}.#{counter} +#{amount}"
      end

      def status(status)
        Rails.logger.info "Status: #{status}"
      end

      def map_entry(key, value)
        Rails.logger.debug "MapEntry: #{key}\t#{value}"
      end

      def entry(value)
        Rails.logger.debug "Entry: #{value}"
      end

      def error(message)
        Rails.logger.error "Error: #{message}"
      end
    end

    # Reporter that discards entries and status; counters are still
    # collected and errors still go to io_err.
    class EmptyOut < CollectCounters
      def status(status); end

      def map_entry(key, value); end

      def entry(value); end

      def error(message)
        MapReduce.io_err.puts message
      end
    end

    # Reporter that captures output in memory so it can be inspected.
    class TestOut < CollectCounters
      # Captured records, in emission order.
      def entries
        @entries ||= []
      end

      # Captured status messages, in emission order.
      def statuses
        @statuses ||= []
      end

      # Every other reporter responds to status; without this method, code
      # that reports status raised NoMethodError whenever TestOut was active.
      def status(status)
        statuses << status
      end

      def map_entry(key, value)
        entries << "#{key}\t#{value}"
      end

      def entry(value)
        entries << value
      end

      def error(message)
        MapReduce.io_err.puts message
      end
    end
  end
end
@@ -0,0 +1,9 @@
module Rubadoop
  module MapReduce
    module JobConfEnvironment
      # Reads a job configuration value from the process environment.
      #
      # Every character of +name+ that is not an ASCII letter or digit is
      # replaced with an underscore before the lookup, so "mapred.job.id"
      # is read from ENV["mapred_job_id"].
      #
      # name - String or Symbol configuration key.
      #
      # Returns the environment value, or nil when the variable is not set.
      def job_conf_environment(name)
        # to_s lets Symbol keys through; tr does the per-character swap
        # without building a regexp match for each call.
        ENV[name.to_s.tr('^0-9A-Za-z', '_')]
      end
    end
  end
end
@@ -0,0 +1,59 @@
module Rubadoop
  module MapReduce
    # Mixin that iterates MapReduce.io_in one line at a time, or in batches.
    module Mappable
      extend ActiveSupport::Concern

      included do
        include Rubadoop::MapReduce::Utils
      end

      # Process Hadoop input as a Mapper. Each line is chomped and yielded to
      # the supplied block. When @input_ignore_key is set, the line is first
      # split with key_value_split and only the value part is passed on (the
      # key is used instead when there is no value).
      #
      # Without a block, returns the array of processed lines (testing only);
      # with a block, returns nil.
      def mapper
        collected = []
        MapReduce.io_in.each_line do |raw|
          raw.chomp!
          record = raw
          if @input_ignore_key
            key, value = key_value_split(raw)
            record = value.nil? ? key : value
          end

          if block_given?
            yield record
          else
            collected << record
          end
        end
        block_given? ? nil : collected
      end

      # Same as #mapper, but groups lines into arrays of up to +batch_size+
      # entries. The final batch may be shorter.
      #
      # Without a block, returns the array of batches; with a block, yields
      # each batch and returns nil.
      def mapper_batched(batch_size = 50)
        pending = []
        collected = []

        mapper do |line|
          pending << line
          next if pending.size < batch_size

          if block_given?
            yield pending
          else
            collected << pending
          end
          pending = []
        end

        # Flush whatever is left over after the last full batch.
        unless pending.empty?
          if block_given?
            yield pending
          else
            collected << pending
          end
        end

        block_given? ? nil : collected
      end
    end
  end
end
@@ -0,0 +1,15 @@
module Rubadoop
  module MapReduce
    # Base mapper class; the line iteration itself comes from Mappable.
    class Mapper
      include Mappable

      # * *Params* :
      #   - +input_ignore_key+ -> Ignore Key Parameter. Required for any
      #     Hadoop InputFormat other than TextInputFormat
      def initialize(config = {})
        @input_ignore_key = config[:input_ignore_key]
      end

      # #process is another name for #mapper.
      alias_method :process, :mapper
    end
  end
end
@@ -0,0 +1,74 @@
module Rubadoop
  module MapReduce
    # Mixin that groups consecutive "key<TAB>value" lines from
    # MapReduce.io_in by key.
    module Reducable
      extend ActiveSupport::Concern

      included do
        include Rubadoop::MapReduce::Utils
      end

      # Entries come in one per line: key\tvalue1 \n key\tvalue2 \n etc.
      # This pivots them to: key, [value1, value2], etc.
      #
      # With a block, calls it once per key with (key, values), where values
      # is a lazy enumerator reading straight from the input, and returns nil.
      # Without a block (testing only), returns an array of
      # { key:, values: } hashes.
      def reducer(&block)
        groups = []
        enum = nil

        line = MapReduce.io_in.gets
        return (block_given? ? nil : groups) if line.nil?

        line.chomp!
        key, value = key_value_split(line)
        last_key = key

        loop do
          # The previous group's values were not read to the end: drain them
          # so the input is positioned at the first line of the next key.
          if enum && !enum.flushed
            enum.each { |_| }
            break if line.nil?
          end

          enum = TrackingEnumerator.new do |y|
            y << value
            loop do
              line = MapReduce.io_in.gets
              unless line.nil?
                line.chomp!
                key, value = key_value_split(line)
              end
              # End of input or a new key closes the current group.
              if line.nil? || key != last_key
                enum.flushed = true
                break
              end
              y << value
            end
            last_key = key
          end

          enum.flushed = false
          if block_given?
            block.call(last_key, enum)
          else
            groups << { key: last_key, values: enum.to_a } # must execute enum immediately here
          end
          break if line.nil?
        end

        groups unless block_given?
      end
    end

    # Enumerator that also remembers whether its group was read to the end.
    class TrackingEnumerator < Enumerator
      attr_accessor :flushed
    end
  end
end
@@ -0,0 +1,12 @@
module Rubadoop
  module MapReduce
    # Base reducer class; the grouping logic itself comes from Reducable.
    class Reducer
      include Reducable

      # +config+ is accepted but nothing is read from it.
      def initialize(config = {}); end

      # #process is another name for #reducer.
      alias_method :process, :reducer
    end
  end
end
@@ -0,0 +1,65 @@
module Rubadoop
  module MapReduce
    # Test helpers that run a block against in-memory input while capturing
    # everything written through MapReduce.out.
    module TestAssist
      # Runs the block with +input+ installed as MapReduce.io_in and a fresh
      # TestOut installed as MapReduce.out.
      #
      # input - nil, String, Array of lines, Hash of key => value(s), or any
      #         other object, which is used as the input stream unchanged.
      #
      # Returns the TestOut that was active while the block ran.
      def run_test_mapper(input, &block)
        run_with_test_io(prepare_input(input), &block)
      end

      # Same contract as #run_test_mapper.
      def run_test_reducer(input, &block)
        run_with_test_io(prepare_input(input), &block)
      end

      private

      # Converts the supported input shapes into a readable stream:
      # nil    -> empty stream
      # String -> stream over that text
      # Array  -> one line per item
      # Hash   -> "key<TAB>value" lines, one per value for Array values
      # Anything else is passed through untouched.
      def prepare_input(input)
        case input
        when nil
          # An empty StringIO rather than '' so that callers can use #gets.
          StringIO.new
        when String
          StringIO.new(input)
        when Array
          stream = StringIO.new
          input.each { |item| stream.puts item }
          stream.rewind
          stream
        when Hash
          stream = StringIO.new
          input.each do |key, value|
            if value.is_a?(Array)
              value.each { |entry| stream << key << "\t" << entry << "\n" }
            else
              stream << key << "\t" << value << "\n"
            end
          end
          stream.rewind
          stream
        else
          input
        end
      end

      # Swaps in the test input and a TestOut reporter, yields, and restores
      # the previous reporter and streams even when the block raises, so a
      # failing test cannot leak its IO setup into later tests.
      def run_with_test_io(input_io)
        map_reduce = ::Rubadoop::MapReduce
        saved_out = map_reduce.out
        saved_in = map_reduce.io_in
        saved_io_out = map_reduce.io_out
        saved_io_err = map_reduce.io_err
        test_io = TestOut.new

        begin
          map_reduce.io_in = input_io if input_io
          map_reduce.out = test_io
          yield
        ensure
          map_reduce.out = saved_out
          map_reduce.io_in = saved_in
          map_reduce.io_out = saved_io_out
          map_reduce.io_err = saved_io_err
        end

        test_io
      end
    end
  end
end