streaming_join 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG ADDED
@@ -0,0 +1,2 @@
1
+ * 2011-08-19 - fsf
2
+ - initial import
data/README ADDED
@@ -0,0 +1,23 @@
1
+ streaming_join
2
+
3
+ Ruby classes intended to be used in Hadoop Streaming reducers. They
4
+ have been tested with JRuby 1.6+ and Ruby 1.9.2+.
5
+
6
+ Examples (found in the examples directory) use the test data and
7
+ scenarios from here:
8
+
9
+ http://en.wikipedia.org/wiki/Join_(SQL)
10
+
11
+ The equivalent SQL for each example is listed in its directory. The
12
+ supported join types are:
13
+
14
+ inner_join
15
+ cross_join
16
+ left_outer_join
17
+ right_outer_join
18
+ full_outer_join
19
+ merge_rows
20
+
21
+ Please let me know if you find this software useful!
22
+
23
+ --frank
@@ -0,0 +1,30 @@
1
+ require 'streaming_join/join'
2
+
3
# Cross join: buffers every input row in memory, then emits every
# left row paired with every right row, regardless of key.
#
# Input lines are split on @sep_in into key, side and value; side.to_i
# selects the slot in @join the [key, value] pair is stored in, and the
# first two slots are read back as the left and right sides.
class CrossJoin < Join
  # Emits "lkey<sep>lvalue<sep>rkey<sep>rvalue" for every left/right pair.
  # Each row is yielded when a block is given, otherwise written with puts.
  # Nothing is emitted when either side received no rows.
  def output
    left, right = @join

    # A side that never received a row is nil; the cross product with an
    # empty side is empty, so there is nothing to emit (and nothing to
    # call .each on).
    return unless left && right

    left.each do |lk, lv|
      right.each do |rk, rv|
        o = "#{lk}#{@sep_out}#{lv}#{@sep_out}#{rk}#{@sep_out}#{rv}"
        if block_given?
          yield o
        else
          puts o
        end
      end
    end
  end

  # Reads all of +input+ (anything responding to #each that yields lines,
  # STDIN by default), buffers the rows per side, then calls #output once,
  # forwarding the optional block.
  def process_stream(input = STDIN, &blk)
    @join = [] # big memory, big prizes

    input.each do |line|
      key, side, value = line.chomp.split(@sep_in, 3)

      # A blank line splits into nothing; skip it rather than buffering
      # a [nil, nil] row on the left side.
      next if key.nil?

      (@join[side.to_i] ||= []) << [key, value]
    end

    output(&blk) unless @join.empty?
  end
end
@@ -0,0 +1,59 @@
1
+ require 'streaming_join/join'
2
+
3
# Full outer join for one key group: matched left/right pairs are emitted
# together, and rows from a side with no counterpart are emitted on their
# own, optionally padded with empty columns.
class FullOuterJoin < Join
  # Emits the joined rows for +key+ from the sides buffered in @join.
  # Each row is yielded when a block is given, otherwise written with puts.
  def output(key)
    report 'keys'

    lhs, rhs = @join
    report(lhs ? 'left keys' : 'null left')
    report(rhs ? 'right keys' : 'null right')

    # The number of columns on each side can be passed in as env variables
    # so that a full-width record with empty "" values is produced even
    # when one side has no match.
    pad_right = ENV['streaming_join_cols_right'].to_i
    pad_left  = ENV['streaming_join_cols_left'].to_i

    right_fill = pad_right > 0 ? "#{@sep_out}#{Array.new(pad_right).join(@sep_out)}" : ''
    left_fill  = pad_left > 0 ? "#{Array.new(pad_left).join(@sep_out)}#{@sep_out}" : ''

    # Single place that decides between yielding and printing a row.
    emit = lambda { |row| block_given? ? (yield row) : (puts row) }

    if lhs
      lhs.each do |lval|
        if rhs
          report 'left and right'
          rhs.each do |rval|
            emit.call("#{key}#{@sep_out}#{lval}#{@sep_out}#{rval}")
          end
        else
          report 'null right'
          emit.call("#{key}#{@sep_out}#{lval}#{right_fill}")
        end
      end
    end

    # Right rows are emitted on their own only when no left rows exist;
    # otherwise they were already paired above.
    if rhs && !lhs
      rhs.each do |rval|
        report 'null left'
        emit.call("#{key}#{@sep_out}#{left_fill}#{rval}")
      end
    end

    rhs
  end
end
@@ -0,0 +1,56 @@
1
# Base Join class (inner join).
#
# Consumes lines of the form "key<sep>side<sep>value". Consecutive lines
# sharing a key form one group; within a group, side.to_i selects the slot
# in @join the value is appended to, and the first two slots are read back
# as the left and right sides.
class Join
  # An output separator given as a character code: either a backslash
  # (optionally followed by "u") and hex digits, e.g. \001 or \u001f, or
  # bare decimal digits, e.g. 001. Anything else is used literally.
  SEP_CODE = /\A(?:\\u?(\h+)|(\d+))\Z/

  # Reads the input and output separators from the environment, both
  # defaulting to a tab. An output separator that looks like a character
  # code is converted to the single character with that (hex) code.
  def initialize
    @sep_in  = ENV['stream_map_output_field_separator'] || "\t"
    @sep_out = ENV['streaming_join_output_separator'] || "\t"

    # The digits are interpreted as hex, so the prefixed form must accept
    # a-f as well; bare digits keep working exactly as before.
    code = SEP_CODE.match(@sep_out)
    @sep_out = (code[1] || code[2]).hex.chr if code
  end

  # Writes a "reporter:counter:join,<detail>,1" line to STDERR.
  def report(detail)
    STDERR.puts "reporter:counter:join,#{detail},1"
  end

  # Emits "key<sep>left<sep>right" for every left/right pair buffered for
  # +key+. Each row is yielded when a block is given, otherwise written
  # with puts. Nothing is emitted when either side is missing.
  def output(key)
    report 'keys'

    left, right = @join
    unless left
      report 'null left'
      return
    end
    unless right
      report 'null right'
      return
    end

    left.each do |l|
      report 'left and right'
      right.each do |r|
        o = "#{key}#{@sep_out}#{l}#{@sep_out}#{r}"
        if block_given?
          yield o
        else
          puts o
        end
      end
    end
  end

  # Reads lines from +input+ (anything responding to #each, STDIN by
  # default) and calls #output once per group of consecutive lines that
  # share a key, forwarding the optional block.
  def process_stream(input = STDIN, &blk)
    last_key = nil
    @join = []

    input.each do |line|
      key, side, value = line.chomp.split(@sep_in, 3)

      # A blank line splits into nothing. Skip it so it neither ends the
      # current group early nor adds a nil value to the next one.
      next if key.nil?

      if last_key && last_key != key
        output(last_key, &blk)
        @join = []
      end

      (@join[side.to_i] ||= []) << value
      last_key = key
    end

    # Flush the final group, if any line was read at all.
    output(last_key, &blk) if last_key
  end
end
@@ -0,0 +1,39 @@
1
+ require 'streaming_join/join'
2
+
3
# Left outer join for one key group: every left row is emitted, paired
# with each right row when there are any, otherwise on its own
# (optionally padded with empty right-hand columns).
class LeftOuterJoin < Join
  # Emits the joined rows for +key+ from the sides buffered in @join.
  # Each row is yielded when a block is given, otherwise written with puts.
  # Nothing is emitted when the left side is missing.
  def output(key)
    report 'keys'

    lhs, rhs = @join
    unless lhs
      report 'null left'
      return
    end
    report 'left keys'

    # The number of columns on the right can be passed in as an env
    # variable so that a full-width record with empty "" values is
    # produced even when there is no match.
    pad_right = ENV['streaming_join_cols_right'].to_i
    right_fill = pad_right > 0 ? "#{@sep_out}#{Array.new(pad_right).join(@sep_out)}" : ''

    # Single place that decides between yielding and printing a row.
    emit = lambda { |row| block_given? ? (yield row) : (puts row) }

    lhs.each do |lval|
      if rhs
        report 'left and right'
        rhs.each do |rval|
          emit.call("#{key}#{@sep_out}#{lval}#{@sep_out}#{rval}")
        end
      else
        report 'null right'
        emit.call("#{key}#{@sep_out}#{lval}#{right_fill}")
      end
    end
  end
end
@@ -0,0 +1,29 @@
1
+ require 'streaming_join'
2
+
3
# Merges all values of consecutive lines sharing a key into one row.
#
# Input lines are split on @sep_in into key and value (there is no side
# column); the values of a key group are buffered in @join.
class MergeRows < Join
  # Emits "key<sep>value1<sep>value2..." for the values buffered for +key+.
  # The row is yielded when a block is given, otherwise written with puts.
  def output(key)
    report 'keys'

    o = "#{key}#{@sep_out}#{@join.join(@sep_out)}"
    block_given? ? (yield o) : (puts o)
  end

  # Reads lines from +input+ (anything responding to #each, STDIN by
  # default) and calls #output once per group of consecutive lines that
  # share a key. The optional block is forwarded to #output so rows can
  # be collected by the caller instead of printed, matching
  # Join#process_stream.
  def process_stream(input = STDIN, &blk)
    last_key = nil
    @join = []

    input.each do |line|
      key, value = line.chomp.split(@sep_in, 2)

      # A blank line splits into nothing. Skip it so it neither ends the
      # current group early nor adds a nil value to the next one.
      next if key.nil?

      if last_key && last_key != key
        output(last_key, &blk)
        @join = []
      end

      @join << value
      last_key = key
    end

    # Flush the final group, if any line was read at all.
    output(last_key, &blk) if last_key
  end
end
@@ -0,0 +1,39 @@
1
+ require 'streaming_join/join'
2
+
3
# Right outer join for one key group: every right row is emitted, paired
# with each left row when there are any, otherwise on its own
# (optionally preceded by empty left-hand columns).
class RightOuterJoin < Join
  # Emits the joined rows for +key+ from the sides buffered in @join.
  # Each row is yielded when a block is given, otherwise written with puts.
  # Nothing is emitted when the right side is missing.
  def output(key)
    report 'keys'

    lhs, rhs = @join
    unless rhs
      report 'null right'
      return
    end
    report 'right keys'

    # The number of columns on the left can be passed in as an env
    # variable so that a full-width record with empty "" values is
    # produced even when there is no match.
    pad_left = ENV['streaming_join_cols_left'].to_i
    left_fill = pad_left > 0 ? "#{Array.new(pad_left).join(@sep_out)}#{@sep_out}" : ''

    # Single place that decides between yielding and printing a row.
    emit = lambda { |row| block_given? ? (yield row) : (puts row) }

    rhs.each do |rval|
      if lhs
        report 'left and right'
        lhs.each do |lval|
          emit.call("#{key}#{@sep_out}#{lval}#{@sep_out}#{rval}")
        end
      else
        report 'null left'
        emit.call("#{key}#{@sep_out}#{left_fill}#{rval}")
      end
    end
  end
end
@@ -0,0 +1,6 @@
1
+ require 'streaming_join/join.rb'
2
+ require 'streaming_join/left_outer_join.rb'
3
+ require 'streaming_join/right_outer_join.rb'
4
+ require 'streaming_join/full_outer_join.rb'
5
+ require 'streaming_join/cross_join.rb'
6
+ require 'streaming_join/merge_rows.rb'
metadata ADDED
@@ -0,0 +1,72 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: streaming_join
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 1
8
+ - 0
9
+ version: 0.1.0
10
+ platform: ruby
11
+ authors:
12
+ - Frank Fejes
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2011-08-19 00:00:00 -05:00
18
+ default_executable:
19
+ dependencies: []
20
+
21
+ description: Classes to process various joins in Hadoop Streaming.
22
+ email: frank@fejes.net
23
+ executables: []
24
+
25
+ extensions: []
26
+
27
+ extra_rdoc_files: []
28
+
29
+ files:
30
+ - README
31
+ - CHANGELOG
32
+ - lib/streaming_join.rb
33
+ - lib/streaming_join/join.rb
34
+ - lib/streaming_join/left_outer_join.rb
35
+ - lib/streaming_join/right_outer_join.rb
36
+ - lib/streaming_join/full_outer_join.rb
37
+ - lib/streaming_join/cross_join.rb
38
+ - lib/streaming_join/merge_rows.rb
39
+ has_rdoc: true
40
+ homepage: https://github.com/fsfiii/streaming_join
41
+ licenses: []
42
+
43
+ post_install_message:
44
+ rdoc_options: []
45
+
46
+ require_paths:
47
+ - lib
48
+ required_ruby_version: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ segments:
54
+ - 0
55
+ version: "0"
56
+ required_rubygems_version: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ segments:
62
+ - 0
63
+ version: "0"
64
+ requirements: []
65
+
66
+ rubyforge_project: streaming_join
67
+ rubygems_version: 1.3.7
68
+ signing_key:
69
+ specification_version: 3
70
+ summary: Classes to process joins in Hadoop Streaming
71
+ test_files: []
72
+