streaming_join 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG ADDED
@@ -0,0 +1,2 @@
1
+ * 2011-08-19 - fsf
2
+ - initial import
data/README ADDED
@@ -0,0 +1,23 @@
1
+ streaming_join
2
+
3
+ Ruby classes intended to be used in Hadoop Streaming reducers. They
4
+ have been tested with JRuby 1.6+ and Ruby 1.9.2+.
5
+
6
+ Examples (found in the examples directory) use the test data and
7
+ scenarios from here:
8
+
9
+ http://en.wikipedia.org/wiki/Join_(SQL)
10
+
11
+ The equivalent SQL for each example is listed in its directory. The
12
+ supported join types are:
13
+
14
+ inner_join
15
+ cross_join
16
+ left_outer_join
17
+ right_outer_join
18
+ full_outer_join
19
+ merge_rows
20
+
21
+ Please let me know if you find this software useful!
22
+
23
+ --frank
@@ -0,0 +1,30 @@
1
+ require 'streaming_join/join'
2
+
3
# Cross join: pairs every buffered left row with every buffered right row,
# regardless of key. The whole input is held in memory before anything is
# emitted.
#
# Input lines have the form "key<sep>side<sep>value"; side 0 is stored as the
# left relation and side 1 as the right relation.
class CrossJoin < Join
  # Emits one row per (left, right) pair in the form
  # "left_key<sep>left_value<sep>right_key<sep>right_value".
  #
  # Each row is yielded to the block when one is given, otherwise written
  # with puts. Nothing is emitted when either side has no rows.
  def output
    left, right = @join
    # A side that never appeared in the input is nil here; the cross product
    # with an empty relation is empty, so there is nothing to emit.
    return unless left && right

    left.each do |lk, lv|
      right.each do |rk, rv|
        o = "#{lk}#{@sep_out}#{lv}#{@sep_out}#{rk}#{@sep_out}#{rv}"
        if block_given?
          yield o
        else
          puts o
        end
      end
    end
  end

  # Reads every line from +input+ (default STDIN), buffers the [key, value]
  # pairs per side, then emits the cross product through #output. The
  # optional block is forwarded to #output.
  def process_stream(input = STDIN, &blk)
    @join = [] # big memory, big prizes

    input.each do |line|
      key, side, value = line.chomp.split(@sep_in, 3)
      # An empty line splits into no fields at all; skip it so that it does
      # not become a [nil, nil] row on the left side.
      next if key.nil?

      (@join[side.to_i] ||= []) << [key, value]
    end

    output(&blk) unless @join.empty?
  end
end
@@ -0,0 +1,59 @@
1
+ require 'streaming_join/join'
2
+
3
# Full outer join: emits matched left/right pairs for a key, plus the rows of
# whichever side is present when the other side is missing.
class FullOuterJoin < Join
  # Emits the joined rows for +key+ from the rows buffered in @join
  # (index 0 = left rows, index 1 = right rows).
  #
  # Rows are yielded to the block when one is given, otherwise written with
  # puts. Counters are reported through #report along the way.
  #
  # The number of columns on each side can be passed in through the
  # environment variables streaming_join_cols_left and
  # streaming_join_cols_right, so that a full-width record with empty ""
  # values is produced even when there is no match. Unset or non-numeric
  # values count as 0, which disables padding.
  def output(key)
    report 'keys'

    left, right = @join
    # NOTE(review): 'null left' / 'null right' are counted once per key here
    # and again once per emitted row below -- confirm both are intended.
    report(left ? 'left keys' : 'null left')
    report(right ? 'right keys' : 'null right')

    cols_r = ENV['streaming_join_cols_right'].to_i
    cols_l = ENV['streaming_join_cols_left'].to_i

    if left
      # Array.new(n).join(sep) renders n empty fields (n - 1 separators).
      right_pad = cols_r > 0 ? "#{@sep_out}#{Array.new(cols_r).join(@sep_out)}" : ''

      left.each do |l|
        if right
          report 'left and right'
          right.each do |r|
            o = "#{key}#{@sep_out}#{l}#{@sep_out}#{r}"
            block_given? ? (yield o) : (puts o)
          end
        else
          report 'null right'
          o = "#{key}#{@sep_out}#{l}#{right_pad}"
          block_given? ? (yield o) : (puts o)
        end
      end
    end

    # Right-only rows exist only when the key has no left rows at all, so the
    # check is done once here instead of once per right row.
    if right && !left
      left_pad = cols_l > 0 ? "#{Array.new(cols_l).join(@sep_out)}#{@sep_out}" : ''

      right.each do |r|
        report 'null left'
        o = "#{key}#{@sep_out}#{left_pad}#{r}"
        block_given? ? (yield o) : (puts o)
      end
    end
  end
end
@@ -0,0 +1,56 @@
1
# Base Join class (inner join).
#
# Input lines have the form "key<sep>side<sep>value"; side 0 is stored as the
# left relation and side 1 as the right relation. Lines are expected to
# arrive grouped by key: whenever the key changes, the rows buffered for the
# previous key are joined and emitted.
#
# Environment variables read at construction time:
#   stream_map_output_field_separator - input field separator (default tab)
#   streaming_join_output_separator   - output field separator (default tab);
#     may be a hex character code such as "001", "\001" or "\u001f"
class Join
  # A separator given as a hex character code: either bare decimal digits
  # ("001") or a "\" / "\u" prefix followed by hex digits ("\u001f").
  # Hex letters are only accepted with the prefix so that plain alphabetic
  # separators such as "ab" keep their literal meaning.
  SEPARATOR_CODE = /\A(?:\\u?(\h+)|(\d+))\Z/

  def initialize
    @sep_in  = ENV['stream_map_output_field_separator'] || "\t"
    @sep_out = decode_separator(ENV['streaming_join_output_separator'] || "\t")
  end

  # Increments the counter +detail+ in the "join" group by writing a
  # Hadoop Streaming reporter line to STDERR.
  def report(detail)
    STDERR.puts "reporter:counter:join,#{detail},1"
  end

  # Emits the inner join for +key+: one row "key<sep>left<sep>right" for each
  # combination of buffered left and right values. Rows are yielded to the
  # block when one is given, otherwise written with puts. Nothing is emitted
  # when either side has no rows for the key.
  def output(key)
    report 'keys'

    left, right = @join
    unless left
      report 'null left'
      return
    end
    unless right
      report 'null right'
      return
    end

    left.each do |l|
      report 'left and right'
      right.each do |r|
        o = "#{key}#{@sep_out}#{l}#{@sep_out}#{r}"
        if block_given?
          yield o
        else
          puts o
        end
      end
    end
  end

  # Reads lines from +input+ (default STDIN), buffers the values of each key
  # per side and calls #output once per key. The optional block is forwarded
  # to #output and receives every emitted row.
  def process_stream(input = STDIN, &blk)
    last_key = nil
    @join = []

    input.each do |line|
      key, side, value = line.chomp.split(@sep_in, 3)
      # An empty line splits into no fields at all. Skip it: treating it as a
      # nil key would stop the buffer from being reset for the next key and
      # leak a nil row into that key's left side.
      next if key.nil?

      if last_key && last_key != key
        output(last_key, &blk)
        @join = []
      end

      (@join[side.to_i] ||= []) << value
      last_key = key
    end

    # Flush the final group; last_key is nil only when no keyed line was read.
    output(last_key, &blk) if last_key
  end

  private

  # Converts a separator written as a hex character code into the character
  # itself; any other string is returned unchanged.
  def decode_separator(raw)
    m = SEPARATOR_CODE.match(raw)
    return raw unless m

    code = (m[1] || m[2]).hex
    # Integer#chr without an encoding only covers 0..255, so larger code
    # points are encoded as UTF-8.
    code > 0xFF ? code.chr(Encoding::UTF_8) : code.chr
  rescue RangeError
    # Not a valid code point: fall back to the literal string.
    raw
  end
end
@@ -0,0 +1,39 @@
1
+ require 'streaming_join/join'
2
+
3
# Left outer join: every left row is emitted, paired with each right row for
# the same key when there are any, or on its own (optionally padded) when
# there are none.
class LeftOuterJoin < Join
  # Emits the rows for +key+ from the values buffered in @join
  # (index 0 = left rows, index 1 = right rows). Rows are yielded to the
  # block when one is given, otherwise written with puts.
  #
  # The number of columns on the right can be passed in through the
  # environment variable streaming_join_cols_right so that a full-width
  # record with empty "" values is produced even if there is no match.
  def output(key)
    report 'keys'

    left, right = @join
    unless left
      report 'null left'
      return
    end
    report 'left keys'

    cols_r = ENV['streaming_join_cols_right'].to_i
    # Array.new(n).join(sep) renders n empty fields (n - 1 separators).
    right_pad = cols_r > 0 ? "#{@sep_out}#{Array.new(cols_r).join(@sep_out)}" : ''

    left.each do |l|
      if right
        report 'left and right'
        right.each do |r|
          row = "#{key}#{@sep_out}#{l}#{@sep_out}#{r}"
          block_given? ? (yield row) : (puts row)
        end
        next
      end

      report 'null right'
      row = "#{key}#{@sep_out}#{l}#{right_pad}"
      block_given? ? (yield row) : (puts row)
    end
  end
end
@@ -0,0 +1,29 @@
1
+ require 'streaming_join'
2
+
3
# Merges all values that share a key into a single output row.
#
# Input lines have the form "key<sep>value" and are expected to arrive
# grouped by key.
class MergeRows < Join
  # Emits "key<sep>value1<sep>value2..." for the values buffered in @join.
  # The row is yielded to the block when one is given, otherwise written
  # with puts.
  def output(key)
    report 'keys'

    o = "#{key}#{@sep_out}#{@join.join(@sep_out)}"
    block_given? ? (yield o) : (puts o)
  end

  # Reads lines from +input+ (default STDIN), collects the values of each
  # key and calls #output once per key. The optional block is forwarded to
  # #output so callers can capture rows instead of having them printed.
  def process_stream(input = STDIN, &blk)
    last_key = nil
    @join = []

    input.each do |line|
      key, value = line.chomp.split(@sep_in, 2)
      # An empty line splits into no fields at all. Skip it: treating it as a
      # nil key would stop the buffer from being reset for the next key.
      next if key.nil?

      if last_key && last_key != key
        output(last_key, &blk)
        @join = []
      end

      @join << value
      last_key = key
    end

    # Flush the final group; last_key is nil only when no keyed line was read.
    output(last_key, &blk) if last_key
  end
end
@@ -0,0 +1,39 @@
1
+ require 'streaming_join/join'
2
+
3
# Right outer join: every right row is emitted, paired with each left row for
# the same key when there are any, or on its own (optionally padded) when
# there are none.
class RightOuterJoin < Join
  # Emits the rows for +key+ from the values buffered in @join
  # (index 0 = left rows, index 1 = right rows). Rows are yielded to the
  # block when one is given, otherwise written with puts.
  #
  # The number of columns on the left can be passed in through the
  # environment variable streaming_join_cols_left so that a full-width
  # record with empty "" values is produced even if there is no match.
  def output(key)
    report 'keys'

    left, right = @join
    unless right
      report 'null right'
      return
    end
    report 'right keys'

    cols_l = ENV['streaming_join_cols_left'].to_i
    # Array.new(n).join(sep) renders n empty fields (n - 1 separators).
    left_pad = cols_l > 0 ? "#{Array.new(cols_l).join(@sep_out)}#{@sep_out}" : ''

    right.each do |r|
      if left
        report 'left and right'
        left.each do |l|
          row = "#{key}#{@sep_out}#{l}#{@sep_out}#{r}"
          block_given? ? (yield row) : (puts row)
        end
        next
      end

      report 'null left'
      row = "#{key}#{@sep_out}#{left_pad}#{r}"
      block_given? ? (yield row) : (puts row)
    end
  end
end
@@ -0,0 +1,6 @@
1
+ require 'streaming_join/join.rb'
2
+ require 'streaming_join/left_outer_join.rb'
3
+ require 'streaming_join/right_outer_join.rb'
4
+ require 'streaming_join/full_outer_join.rb'
5
+ require 'streaming_join/cross_join.rb'
6
+ require 'streaming_join/merge_rows.rb'
metadata ADDED
@@ -0,0 +1,72 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: streaming_join
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 1
8
+ - 0
9
+ version: 0.1.0
10
+ platform: ruby
11
+ authors:
12
+ - Frank Fejes
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2011-08-19 00:00:00 -05:00
18
+ default_executable:
19
+ dependencies: []
20
+
21
+ description: Classes to process various joins in Hadoop Streaming.
22
+ email: frank@fejes.net
23
+ executables: []
24
+
25
+ extensions: []
26
+
27
+ extra_rdoc_files: []
28
+
29
+ files:
30
+ - README
31
+ - CHANGELOG
32
+ - lib/streaming_join.rb
33
+ - lib/streaming_join/join.rb
34
+ - lib/streaming_join/left_outer_join.rb
35
+ - lib/streaming_join/right_outer_join.rb
36
+ - lib/streaming_join/full_outer_join.rb
37
+ - lib/streaming_join/cross_join.rb
38
+ - lib/streaming_join/merge_rows.rb
39
+ has_rdoc: true
40
+ homepage: https://github.com/fsfiii/streaming_join
41
+ licenses: []
42
+
43
+ post_install_message:
44
+ rdoc_options: []
45
+
46
+ require_paths:
47
+ - lib
48
+ required_ruby_version: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ segments:
54
+ - 0
55
+ version: "0"
56
+ required_rubygems_version: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ segments:
62
+ - 0
63
+ version: "0"
64
+ requirements: []
65
+
66
+ rubyforge_project: streaming_join
67
+ rubygems_version: 1.3.7
68
+ signing_key:
69
+ specification_version: 3
70
+ summary: Classes to process joins in Hadoop Streaming
71
+ test_files: []
72
+