streaming_join 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +2 -0
- data/README +23 -0
- data/lib/streaming_join/cross_join.rb +30 -0
- data/lib/streaming_join/full_outer_join.rb +59 -0
- data/lib/streaming_join/join.rb +56 -0
- data/lib/streaming_join/left_outer_join.rb +39 -0
- data/lib/streaming_join/merge_rows.rb +29 -0
- data/lib/streaming_join/right_outer_join.rb +39 -0
- data/lib/streaming_join.rb +6 -0
- metadata +72 -0
data/CHANGELOG
ADDED
data/README
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
streaming_join
|
2
|
+
|
3
|
+
Ruby classes intended to be used in Hadoop Streaming reducers. It
|
4
|
+
has been tested with JRuby 1.6+ and Ruby 1.9.2+.
|
5
|
+
|
6
|
+
Examples (found in the examples directory) use the test data and
|
7
|
+
scenarios from here:
|
8
|
+
|
9
|
+
http://en.wikipedia.org/wiki/Join_(SQL)
|
10
|
+
|
11
|
+
The equivalent sql for each example is listed in its directory. The
|
12
|
+
supported join types are:
|
13
|
+
|
14
|
+
inner_join
|
15
|
+
cross_join
|
16
|
+
left_outer_join
|
17
|
+
right_outer_join
|
18
|
+
full_outer_join
|
19
|
+
merge_rows
|
20
|
+
|
21
|
+
Please let me know if you find this software useful!
|
22
|
+
|
23
|
+
--frank
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'streaming_join/join'
|
2
|
+
|
3
|
+
# Cross join (Cartesian product) of the two sides of the input stream.
#
# Each input line is split on @sep_in into key, side and value. Every row is
# buffered in memory, grouped by side index, and the product is emitted once
# the whole input has been consumed.
class CrossJoin < Join
  # Emits one record per (left row, right row) pair in the form
  # "lkey<sep>lvalue<sep>rkey<sep>rvalue", using @sep_out as the separator.
  #
  # Each record is yielded to the block when one is given, otherwise it is
  # written with puts.
  def output
    left, right = @join

    # A side that never appeared in the input is nil here. The product with
    # an empty side is empty, so emit nothing instead of calling #each on nil.
    return unless left && right

    left.each do |lk, lv|
      right.each do |rk, rv|
        o = "#{lk}#{@sep_out}#{lv}#{@sep_out}#{rk}#{@sep_out}#{rv}"
        if block_given?
          yield o
        else
          puts o
        end
      end
    end
  end

  # Reads every line of +input+ (anything responding to #each, STDIN by
  # default), buffers the [key, value] pairs per side, then calls #output.
  #
  # The optional block is forwarded to #output and receives each record.
  def process_stream(input = STDIN, &blk)
    @join = [] # big memory, big prizes

    input.each do |line|
      key, side, value = line.chomp.split(@sep_in, 3)

      # side.to_i selects the bucket: index 0 is read back as the left side
      # and index 1 as the right side in #output.
      (@join[side.to_i] ||= []) << [key, value]
    end

    output(&blk) unless @join.empty?
  end
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
require 'streaming_join/join'
|
2
|
+
|
3
|
+
# Full outer join: emits matched rows, plus unmatched rows from either side.
class FullOuterJoin < Join
  # Emits the records for one key from the rows buffered in @join.
  #
  # The environment variables streaming_join_cols_right and
  # streaming_join_cols_left give the number of columns on each side; when
  # set to a positive number, an unmatched row is padded with that many
  # empty fields so the record keeps its full width.
  #
  # Each record is yielded to the block when one is given, otherwise it is
  # written with puts.
  def output key
    report 'keys'

    lefts, rights = @join
    report(lefts ? 'left keys' : 'null left')
    report(rights ? 'right keys' : 'null right')

    pad_right = ENV['streaming_join_cols_right'].to_i
    pad_left = ENV['streaming_join_cols_left'].to_i

    if lefts && rights
      # Both sides present: every left row pairs with every right row.
      lefts.each do |lrow|
        report 'left and right'
        rights.each do |rrow|
          record = "#{key}#{@sep_out}#{lrow}#{@sep_out}#{rrow}"
          if block_given?
            yield record
          else
            puts record
          end
        end
      end
    elsif lefts
      # Left rows only: append pad_right empty fields when requested.
      lefts.each do |lrow|
        report 'null right'
        record = "#{key}#{@sep_out}#{lrow}"
        record << (@sep_out * pad_right) if pad_right > 0
        if block_given?
          yield record
        else
          puts record
        end
      end
    elsif rights
      # Right rows only: insert pad_left empty fields before the right row.
      rights.each do |rrow|
        report 'null left'
        record = "#{key}#{@sep_out}"
        record << (@sep_out * pad_left) if pad_left > 0
        record << rrow.to_s
        if block_given?
          yield record
        else
          puts record
        end
      end
    end

    rights
  end
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
# base Join class (inner join)
|
2
|
+
# Base join class; used directly it performs an inner join.
#
# Input lines are split on @sep_in into key, side and value. Rows are
# expected to arrive grouped by key: a change of key flushes the rows
# buffered for the previous key through #output.
class Join
  # Reads the separators from the environment, defaulting both to a tab.
  def initialize
    @sep_in = ENV['stream_map_output_field_separator'] || "\t"
    @sep_out = ENV['streaming_join_output_separator'] || "\t"
    # An output separator made only of digits (optionally prefixed with "\"
    # or "\u") is interpreted as a hexadecimal character code.
    @sep_out = $1.hex.chr if @sep_out =~ /\A(?:\\u?)?(\d+)\Z/
  end

  # Writes a counter line for +detail+ to STDERR.
  # NOTE(review): presumably picked up by Hadoop Streaming as a counter
  # increment - confirm against the Hadoop Streaming documentation.
  def report detail
    STDERR.puts "reporter:counter:join,#{detail},1"
  end

  # Emits "key<sep>left<sep>right" for every left/right pair buffered for
  # +key+. Nothing is emitted when either side is missing (inner join).
  #
  # Each record is yielded to the block when one is given, otherwise it is
  # written with puts.
  def output key
    report 'keys'

    left, right = @join
    if not left
      report 'null left'
      return
    elsif not right
      report 'null right'
      return
    end

    left.each do |l|
      report 'left and right'
      right.each do |r|
        o = "#{key}#{@sep_out}#{l}#{@sep_out}#{r}"
        if block_given?
          yield o
        else
          puts o
        end
      end
    end
  end

  # Reads +input+ (anything responding to #each, STDIN by default) line by
  # line and calls #output once per run of identical keys.
  #
  # The optional block is forwarded to #output and receives each record.
  def process_stream(input = STDIN, &blk)
    last_key = nil
    @join = []

    input.each do |line|
      key, side, value = line.chomp.split(@sep_in, 3)

      # A blank line splits to [] and leaves key nil. Skip it; otherwise a
      # nil value is buffered and leaks into the next key's group as a
      # phantom left row.
      next if key.nil?

      if last_key and last_key != key
        output(last_key, &blk)
        @join = []
      end

      # side.to_i selects the bucket: 0 is the left side, 1 the right side.
      (@join[side.to_i] ||= []) << value
      last_key = key
    end

    # Flush the final group. This tests last_key rather than the key of the
    # final line so that a trailing blank line does not drop the group.
    output(last_key, &blk) if last_key
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'streaming_join/join'
|
2
|
+
|
3
|
+
# Left outer join: every left row is emitted, matched with right rows when
# there are any.
class LeftOuterJoin < Join
  # Emits the records for one key from the rows buffered in @join.
  #
  # The environment variable streaming_join_cols_right gives the number of
  # columns on the right side; when it is a positive number, an unmatched
  # left row is padded with that many empty fields.
  #
  # Each record is yielded to the block when one is given, otherwise it is
  # written with puts.
  def output key
    report 'keys'

    lefts, rights = @join
    unless lefts
      report 'null left'
      return
    end
    report 'left keys'

    pad_right = ENV['streaming_join_cols_right'].to_i

    lefts.each do |lrow|
      if rights
        report 'left and right'
        rights.each do |rrow|
          record = "#{key}#{@sep_out}#{lrow}#{@sep_out}#{rrow}"
          if block_given?
            yield record
          else
            puts record
          end
        end
      else
        report 'null right'
        record = "#{key}#{@sep_out}#{lrow}"
        # pad_right empty fields need pad_right separators after the left row
        record << (@sep_out * pad_right) if pad_right > 0
        if block_given?
          yield record
        else
          puts record
        end
      end
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'streaming_join'
|
2
|
+
|
3
|
+
# Merges all values that share a key into a single output record.
#
# Input lines are split on @sep_in into key and value (there is no side
# field). Rows are expected to arrive grouped by key.
class MergeRows < Join
  # Emits "key<sep>value1<sep>value2..." for the values buffered in @join.
  #
  # The record is yielded to the block when one is given, otherwise it is
  # written with puts.
  def output key
    report 'keys'

    o = "#{key}#{@sep_out}#{@join.join(@sep_out)}"
    block_given? ? (yield o) : (puts o)
  end

  # Reads +input+ (anything responding to #each, STDIN by default) line by
  # line and calls #output once per run of identical keys.
  #
  # The optional block is forwarded to #output so callers can collect the
  # records instead of having them printed, as Join#process_stream allows.
  def process_stream(input = STDIN, &blk)
    last_key = key = nil
    @join = []

    input.each do |line|
      key, value = line.chomp.split(@sep_in, 2)

      if last_key and last_key != key
        output(last_key, &blk)
        @join = []
      end

      @join << value
      last_key = key
    end

    output(last_key, &blk) if key
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'streaming_join/join'
|
2
|
+
|
3
|
+
# Right outer join: every right row is emitted, matched with left rows when
# there are any.
class RightOuterJoin < Join
  # Emits the records for one key from the rows buffered in @join.
  #
  # The environment variable streaming_join_cols_left gives the number of
  # columns on the left side; when it is a positive number, an unmatched
  # right row is preceded by that many empty fields.
  #
  # Each record is yielded to the block when one is given, otherwise it is
  # written with puts.
  def output key
    report 'keys'

    lefts, rights = @join
    unless rights
      report 'null right'
      return
    end
    report 'right keys'

    pad_left = ENV['streaming_join_cols_left'].to_i

    rights.each do |rrow|
      if lefts
        report 'left and right'
        lefts.each do |lrow|
          record = "#{key}#{@sep_out}#{lrow}#{@sep_out}#{rrow}"
          if block_given?
            yield record
          else
            puts record
          end
        end
      else
        report 'null left'
        record = "#{key}#{@sep_out}"
        # pad_left empty fields need pad_left separators before the right row
        record << (@sep_out * pad_left) if pad_left > 0
        record << rrow.to_s
        if block_given?
          yield record
        else
          puts record
        end
      end
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: streaming_join
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 1
|
8
|
+
- 0
|
9
|
+
version: 0.1.0
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Frank Fejes
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2011-08-19 00:00:00 -05:00
|
18
|
+
default_executable:
|
19
|
+
dependencies: []
|
20
|
+
|
21
|
+
description: Classes to process various joins in Hadoop Streaming.
|
22
|
+
email: frank@fejes.net
|
23
|
+
executables: []
|
24
|
+
|
25
|
+
extensions: []
|
26
|
+
|
27
|
+
extra_rdoc_files: []
|
28
|
+
|
29
|
+
files:
|
30
|
+
- README
|
31
|
+
- CHANGELOG
|
32
|
+
- lib/streaming_join.rb
|
33
|
+
- lib/streaming_join/join.rb
|
34
|
+
- lib/streaming_join/left_outer_join.rb
|
35
|
+
- lib/streaming_join/right_outer_join.rb
|
36
|
+
- lib/streaming_join/full_outer_join.rb
|
37
|
+
- lib/streaming_join/cross_join.rb
|
38
|
+
- lib/streaming_join/merge_rows.rb
|
39
|
+
has_rdoc: true
|
40
|
+
homepage: https://github.com/fsfiii/streaming_join
|
41
|
+
licenses: []
|
42
|
+
|
43
|
+
post_install_message:
|
44
|
+
rdoc_options: []
|
45
|
+
|
46
|
+
require_paths:
|
47
|
+
- lib
|
48
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ">="
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
segments:
|
54
|
+
- 0
|
55
|
+
version: "0"
|
56
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
segments:
|
62
|
+
- 0
|
63
|
+
version: "0"
|
64
|
+
requirements: []
|
65
|
+
|
66
|
+
rubyforge_project: streaming_join
|
67
|
+
rubygems_version: 1.3.7
|
68
|
+
signing_key:
|
69
|
+
specification_version: 3
|
70
|
+
summary: Classes to process joins in Hadoop Streaming
|
71
|
+
test_files: []
|
72
|
+
|