rubadoop 0.7.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +7 -0
  3. data/Gemfile.lock +43 -0
  4. data/README.rdoc +7 -0
  5. data/Rakefile +25 -0
  6. data/lib/rubadoop.rb +26 -0
  7. data/lib/rubadoop/base_dsl.rb +31 -0
  8. data/lib/rubadoop/emr.rb +23 -0
  9. data/lib/rubadoop/emr/jobflow_builder.rb +36 -0
  10. data/lib/rubadoop/emr/jobflow_builder/bootstrap_action.rb +27 -0
  11. data/lib/rubadoop/emr/jobflow_builder/job_spec.rb +77 -0
  12. data/lib/rubadoop/emr/jobflow_builder/step.rb +41 -0
  13. data/lib/rubadoop/map_reduce.rb +23 -0
  14. data/lib/rubadoop/map_reduce/call_java.rb +112 -0
  15. data/lib/rubadoop/map_reduce/call_streaming.rb +55 -0
  16. data/lib/rubadoop/map_reduce/identity.rb +30 -0
  17. data/lib/rubadoop/map_reduce/io.rb +128 -0
  18. data/lib/rubadoop/map_reduce/job_conf_environment.rb +9 -0
  19. data/lib/rubadoop/map_reduce/mappable.rb +59 -0
  20. data/lib/rubadoop/map_reduce/mapper.rb +15 -0
  21. data/lib/rubadoop/map_reduce/reducable.rb +74 -0
  22. data/lib/rubadoop/map_reduce/reducer.rb +12 -0
  23. data/lib/rubadoop/map_reduce/test_assist.rb +65 -0
  24. data/lib/rubadoop/map_reduce/utils.rb +29 -0
  25. data/lib/rubadoop/oozie/workflow_builder.rb +42 -0
  26. data/lib/rubadoop/oozie/workflow_builder/job_properties.rb +19 -0
  27. data/lib/rubadoop/version.rb +3 -0
  28. data/test/rubadoop/base_dsl_test.rb +27 -0
  29. data/test/rubadoop/emr/jobflow_builder_test.rb +184 -0
  30. data/test/rubadoop/map_reduce/call_java_test.rb +122 -0
  31. data/test/rubadoop/map_reduce/call_streaming_test.rb +81 -0
  32. data/test/rubadoop/map_reduce/identity_test.rb +40 -0
  33. data/test/rubadoop/map_reduce/io_test.rb +51 -0
  34. data/test/rubadoop/map_reduce/job_conf_environment_test.rb +28 -0
  35. data/test/rubadoop/map_reduce/mappable_test.rb +62 -0
  36. data/test/rubadoop/map_reduce/mapper_test.rb +76 -0
  37. data/test/rubadoop/map_reduce/reducable_test.rb +12 -0
  38. data/test/rubadoop/map_reduce/reducer_test.rb +137 -0
  39. data/test/rubadoop/map_reduce/test_assist_test.rb +76 -0
  40. data/test/rubadoop/oozie/workflow_builder_test.rb +21 -0
  41. data/test/test_helper.rb +10 -0
  42. metadata +140 -0
@@ -0,0 +1,55 @@
1
module Rubadoop
  module MapReduce
    # Describes a Hadoop streaming invocation on top of CallJava.
    #
    # The streaming-specific settings (mapper, reducer, combiner, input/output
    # paths and formats) are plain accessors; they are rendered as CLI flags by
    # #to_hadoop_cli and exported by #to_h.
    class CallStreaming < CallJava
      attr_accessor :mapper, :reducer, :input, :output, :inputformat, :outputformat, :combiner

      class << self
        # Builds a CallStreaming from +params+ and optionally configures it.
        #
        # A block taking exactly one argument receives the new object; any other
        # block is evaluated with the new object as +self+.
        #
        # Returns the (possibly configured) CallStreaming.
        def new_streaming_call(params = {}, &block)
          call = CallStreaming.new(params)
          return call unless block

          if block.arity == 1
            block.call(call)
          else
            call.instance_eval(&block)
          end
          call
        end
      end

      # Forwards +params+ unchanged to the parent initializer.
      def initialize(params = {})
        super(params)
      end

      # Renders the command as an array of CLI tokens.
      #
      # The parent is asked to skip its own args (skip_args: true) so that the
      # streaming flags come first and @args are appended at the very end.
      def to_hadoop_cli(opts = {})
        cmd = super(opts.merge(skip_args: true))
        [
          ['-inputformat', @inputformat],
          ['-input', @input],
          ['-outputformat', @outputformat],
          ['-output', @output],
          ['-mapper', @mapper],
          ['-reducer', @reducer],
          ['-combiner', @combiner]
        ].each do |flag, setting|
          cmd.concat [flag, "#{setting}"] if setting
        end
        cmd.concat @args if @args
        cmd
      end

      # Returns the parent's hash extended with every streaming setting that is
      # not nil, keyed by the setting's name as a symbol.
      def to_h
        built = super
        %i[mapper reducer combiner inputformat input outputformat output].each do |setting|
          value = instance_variable_get("@#{setting}")
          built[setting] = value unless value.nil?
        end
        built
      end

      # Runs the parent's validation, then raises a RuntimeError naming the
      # first of mapper, reducer, input or output that is still nil.
      def validate
        super
        %i[@mapper @reducer @input @output].each do |property|
          raise "Missing #{property}" if instance_variable_get(property).nil?
        end
      end
    end
  end
end
@@ -0,0 +1,30 @@
1
module Rubadoop
  module MapReduce
    # Pass-through mapper and reducer that emit their input unchanged.
    module Identity
      # Emits every input line as-is. Note that the mapping pass is started
      # from the constructor itself.
      class Mapper < MapReduce::Mapper
        def initialize(config = {})
          super(config)

          mapper { |line| MapReduce.out.entry(line) }
        end
      end

      # Re-emits each grouped value as "key<TAB>value"; a nil value is emitted
      # as the bare key. The reducing pass is started from the constructor.
      class Reducer < MapReduce::Reducer
        def initialize(config = {})
          super(config)

          reducer do |key, values|
            values.each do |value|
              if value.nil?
                MapReduce.out.entry key
              else
                MapReduce.out.entry "#{key}\t#{value}"
              end
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,128 @@
1
module Rubadoop
  module MapReduce
    # Mixin that holds the IO endpoints and the active output sink.
    #
    # The writers store overrides; the readers fall back to STDIN / STDOUT /
    # STDERR, and #out falls back to a memoized HadoopOut.
    module Io
      attr_writer :io_in, :io_out, :io_err
      attr_writer :out

      # Active output sink; a HadoopOut is created on first use.
      def out
        @out ||= HadoopOut.new
      end

      def io_in
        @io_in || STDIN
      end

      def io_out
        @io_out || STDOUT
      end

      def io_err
        @io_err || STDERR
      end

      class << self
        # Each helper below installs a fresh sink on ::Rubadoop::MapReduce.
        def set_silent_output
          ::Rubadoop::MapReduce.out = EmptyOut.new
        end

        def set_standard_output
          ::Rubadoop::MapReduce.out = StandardOut.new
        end

        def set_rails_output
          ::Rubadoop::MapReduce.out = RailsOut.new
        end
      end
    end

    # Base class for sinks that keep counter totals in memory.
    class CollectCounters
      # Nested hash of totals: { group => { counter => amount } }.
      def counters
        @counters ||= {}
      end

      # Adds +amount+ to the running total for +group+/+counter+, creating the
      # entry on first use. Returns the new total.
      def counter(group, counter, amount)
        totals = counters[group] ||= {}
        totals[counter] = totals[counter] ? totals[counter] + amount : amount
      end
    end

    # Sink used by default: entries go to io_out, counters and status go to
    # io_err as "reporter:" lines.
    class HadoopOut
      def counter(group, counter, amount)
        MapReduce.io_err.puts "reporter:counter:#{group},#{counter},#{amount}"
      end

      def status(status)
        MapReduce.io_err.puts "reporter:status:#{status}"
      end

      def map_entry(key, value)
        MapReduce.io_out.puts "#{key}\t#{value}"
      end

      def entry(value)
        MapReduce.io_out.puts value
      end

      def error(message)
        MapReduce.io_err.puts message
      end
    end

    # Labelled output on io_out; counters are also accumulated in memory.
    class StandardOut < CollectCounters
      def counter(group, counter, amount)
        super
        MapReduce.io_out.puts "Counter: #{group}.#{counter} +#{amount}"
      end

      # NOTE(review): unlike the other lines in this class, status keeps the
      # "reporter:status:" prefix -- confirm this is intentional.
      def status(status)
        MapReduce.io_out.puts "reporter:status:#{status}"
      end

      def map_entry(key, value)
        MapReduce.io_out.puts "MapEntry: #{key}\t#{value}"
      end

      def entry(value)
        MapReduce.io_out.puts "Entry: #{value}"
      end

      def error(message)
        MapReduce.io_err.puts message
      end
    end

    # Routes everything through Rails.logger; counters are also accumulated.
    class RailsOut < CollectCounters
      def counter(group, counter, amount)
        super
        Rails.logger.debug "Counter: #{group}.#{counter} +#{amount}"
      end

      def status(status)
        Rails.logger.info "Status: #{status}"
      end

      def map_entry(key, value)
        Rails.logger.debug "MapEntry: #{key}\t#{value}"
      end

      def entry(value)
        Rails.logger.debug "Entry: #{value}"
      end

      def error(message)
        Rails.logger.error "Error: #{message}"
      end
    end

    # Discards status and entries; counters are accumulated and errors still
    # go to io_err.
    class EmptyOut < CollectCounters
      def status(status); end

      def map_entry(key, value); end

      def entry(value); end

      def error(message)
        MapReduce.io_err.puts message
      end
    end

    # Captures output in memory so it can be inspected afterwards.
    class TestOut < CollectCounters
      # Lines captured from #entry and #map_entry, in emission order.
      def entries
        @entries ||= []
      end

      # Status messages captured from #status, in emission order.
      def statuses
        @statuses ||= []
      end

      # Every other sink responds to #status; without it here, code that
      # reports a status raises NoMethodError when run against this sink.
      def status(status)
        statuses << status
      end

      def map_entry(key, value)
        entries << "#{key}\t#{value}"
      end

      def entry(value)
        entries << value
      end

      def error(message)
        MapReduce.io_err.puts message
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
module Rubadoop
  module MapReduce
    # Reads job configuration values from the process environment.
    module JobConfEnvironment
      # Looks up +name+ in ENV after replacing every character that is not an
      # ASCII letter or digit with "_" (e.g. "mapred.job.id" -> "mapred_job_id").
      #
      # name - String or Symbol configuration key.
      #
      # Returns the environment value, or nil when the variable is not set.
      def job_conf_environment(name)
        ENV[name.to_s.tr('^0-9A-Za-z', '_')]
      end
    end
  end
end
@@ -0,0 +1,59 @@
1
module Rubadoop
  module MapReduce
    # Mixin providing line-oriented mapper helpers over MapReduce.io_in.
    module Mappable
      extend ActiveSupport::Concern

      included do
        include Rubadoop::MapReduce::Utils
      end

      # Reads MapReduce.io_in line by line, chomping each line.
      #
      # When @input_ignore_key is set, each line is split with key_value_split
      # and only the value part is kept; if there is no value part, the key
      # part is used instead.
      #
      # With a block, every line is yielded and nil is returned. Without a
      # block (intended for tests), all lines are returned as an array.
      def mapper
        collected = []
        MapReduce.io_in.each_line do |line|
          line.chomp!
          if @input_ignore_key
            key, value = key_value_split(line)
            line = value.nil? ? key : value
          end

          if block_given?
            yield line
          else
            collected << line
          end
        end
        block_given? ? nil : collected
      end

      # Same as #mapper, but hands lines over in arrays of up to +batch_size+.
      # The final batch may be shorter; an empty final batch is never emitted.
      #
      # With a block, every batch is yielded and nil is returned. Without a
      # block, the array of batches is returned.
      def mapper_batched(batch_size = 50)
        pending = []
        collected = []

        mapper do |line|
          pending << line
          next if pending.size < batch_size

          if block_given?
            yield pending
          else
            collected << pending
          end
          pending = []
        end

        # Flush whatever is left over after the last full batch.
        unless pending.empty?
          if block_given?
            yield pending
          else
            collected << pending
          end
        end

        collected unless block_given?
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
module Rubadoop
  module MapReduce
    # Base class for mappers; the behaviour comes from Mappable.
    class Mapper
      include Mappable

      # config - Hash of options:
      #          :input_ignore_key - when truthy, the key part of each input
      #                              line is dropped. Required for any Hadoop
      #                              InputFormat other than TextInputFormat.
      def initialize(config = {})
        @input_ignore_key = config[:input_ignore_key]
      end

      # #process is another name for #mapper.
      alias_method :process, :mapper
    end
  end
end
@@ -0,0 +1,74 @@
1
module Rubadoop
  module MapReduce
    # Mixin that groups consecutive "key<TAB>value" input lines by key.
    module Reducable
      extend ActiveSupport::Concern

      included do
        include Rubadoop::MapReduce::Utils
      end

      # Pivots the per-line input (key\tvalue1, key\tvalue2, ...) into one
      # call per run of equal keys: key, [value1, value2, ...].
      #
      # With a block, the block is called with the key and a lazy enumerator
      # over that key's values, and nil is returned. Values the block does not
      # consume are drained before the next key is started. Without a block
      # (intended for tests), an array of {key:, values:} hashes is returned.
      def reducer(&block)
        groups = []
        values = nil

        line = MapReduce.io_in.gets
        if line.nil?
          # Empty input: nothing to group.
          return block_given? ? nil : groups
        end
        line.chomp!
        key, value = key_value_split(line)

        current_key = key
        loop do
          # The previous group's enumerator was not run to completion by the
          # caller; finish it so the reader is positioned on the next key.
          if values && !values.flushed
            values.each { |_| }
            break if line.nil?
          end

          # The enumerator shares line/key/value/current_key with this method.
          # It stops at end of input or at the first line with another key,
          # leaving that line's key and value in place for the next group.
          values = TrackingEnumerator.new do |yielder|
            yielder << value
            loop do
              line = MapReduce.io_in.gets
              if line.nil?
                values.flushed = true
                break
              end
              line.chomp!

              key, value = key_value_split(line)
              if key != current_key
                values.flushed = true
                break
              end
              yielder << value
            end
            current_key = key
          end

          values.flushed = false
          if block_given?
            block.call(current_key, values)
          else
            # The key is read before to_a runs the enumerator, which advances it.
            groups << { key: current_key, values: values.to_a }
          end
          break if line.nil?
        end

        groups unless block_given?
      end
    end

    private

    # Enumerator that also records whether it has been run to its end.
    class TrackingEnumerator < Enumerator
      attr_accessor :flushed
    end
  end
end
@@ -0,0 +1,12 @@
1
module Rubadoop
  module MapReduce
    # Base class for reducers; the behaviour comes from Reducable.
    class Reducer
      include Reducable

      # Accepts a config hash for symmetry with Mapper; it is not used here.
      def initialize(config = {}); end

      # #process is another name for #reducer.
      alias_method :process, :reducer
    end
  end
end
@@ -0,0 +1,65 @@
1
module Rubadoop
  module MapReduce
    # Test helpers that run a block against in-memory input and a capturing
    # output sink, then put the global MapReduce IO back the way it was.
    module TestAssist
      # Runs the block with +input+ installed as MapReduce.io_in.
      #
      # input - nil, a String, an Array of lines, a Hash of key => value(s),
      #         or any IO-like object (see #prepare_input).
      #
      # Returns the TestOut that captured the block's output.
      def run_test_mapper(input)
        run_with_test_io(prepare_input(input)) { yield }
      end

      # Same contract as #run_test_mapper.
      def run_test_reducer(input)
        run_with_test_io(prepare_input(input)) { yield }
      end

      private

      # Converts +input+ into something readable with both each_line and gets.
      #
      # nil    -> empty StringIO
      # String -> StringIO over that string (a bare String has no public
      #           gets, which the reducer relies on)
      # Array  -> one line per item
      # Hash   -> one "key<TAB>value" line per value; Array values expand to
      #           one line per element
      # other  -> returned unchanged
      def prepare_input(input)
        case input
        when nil
          StringIO.new
        when String
          StringIO.new(input)
        when Array
          buffer = StringIO.new
          input.each { |item| buffer.puts item }
          buffer.rewind
          buffer
        when Hash
          buffer = StringIO.new
          input.each do |key, value|
            if value.is_a? Array
              value.each { |entry| buffer << key << "\t" << entry << "\n" }
            else
              buffer << key << "\t" << value << "\n"
            end
          end
          buffer.rewind
          buffer
        else
          input
        end
      end

      # Swaps in +input_io+ and a fresh TestOut, yields, and restores the
      # previous sink and IO endpoints.
      #
      # The restore happens in an ensure clause so that an exception raised by
      # the block (for example a failing assertion) cannot leave the test sink
      # and test input installed for whatever runs next.
      #
      # Returns the TestOut used during the block.
      def run_with_test_io(input_io)
        map_reduce = ::Rubadoop::MapReduce
        saved_out = map_reduce.out
        saved_in = map_reduce.io_in
        saved_io_out = map_reduce.io_out
        saved_io_err = map_reduce.io_err
        test_out = TestOut.new

        begin
          map_reduce.io_in = input_io if input_io
          map_reduce.out = test_out
          yield
        ensure
          map_reduce.out = saved_out
          map_reduce.io_in = saved_in
          map_reduce.io_out = saved_io_out
          map_reduce.io_err = saved_io_err
        end
        test_out
      end
    end
  end
end