streaming_join 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +2 -0
- data/README +23 -0
- data/lib/streaming_join/cross_join.rb +30 -0
- data/lib/streaming_join/full_outer_join.rb +59 -0
- data/lib/streaming_join/join.rb +56 -0
- data/lib/streaming_join/left_outer_join.rb +39 -0
- data/lib/streaming_join/merge_rows.rb +29 -0
- data/lib/streaming_join/right_outer_join.rb +39 -0
- data/lib/streaming_join.rb +6 -0
- metadata +72 -0
data/CHANGELOG
ADDED
data/README
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
streaming_join
|
2
|
+
|
3
|
+
Ruby classes intended to be used in Hadoop Streaming reducers. It
|
4
|
+
has been tested with jruby 1.6+ and ruby 1.9.2+.
|
5
|
+
|
6
|
+
Examples (found in the examples directory) use the test data and
|
7
|
+
scenarios from here:
|
8
|
+
|
9
|
+
http://en.wikipedia.org/wiki/Join_(SQL)
|
10
|
+
|
11
|
+
The equivalent SQL for each example is listed in its directory. The
|
12
|
+
supported join types are:
|
13
|
+
|
14
|
+
inner_join
|
15
|
+
cross_join
|
16
|
+
left_outer_join
|
17
|
+
right_outer_join
|
18
|
+
full_outer_join
|
19
|
+
merge_rows
|
20
|
+
|
21
|
+
Please let me know if you find this software useful!
|
22
|
+
|
23
|
+
--frank
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'streaming_join/join'
|
2
|
+
|
3
|
+
# Cross join: pairs every left record with every right record, regardless of
# key. All input is buffered in memory before any output is produced.
#
# Input lines have the form "key<sep>side<sep>value"; side.to_i selects the
# slot in @join (0 is read back as the left side, 1 as the right side).
class CrossJoin < Join
  # Emits one row "lkey<sep>lvalue<sep>rkey<sep>rvalue" for every left/right
  # pair. Each row is yielded to the block when one is given, otherwise it is
  # printed with puts.
  def output
    left, right = @join

    # A cross join with an empty side has no rows. Without this guard the
    # missing side is nil and calling #each on it raises NoMethodError.
    return if left.nil? || right.nil?

    left.each do |lk, lv|
      right.each do |rk, rv|
        o = "#{lk}#{@sep_out}#{lv}#{@sep_out}#{rk}#{@sep_out}#{rv}"
        if block_given?
          yield o
        else
          puts o
        end
      end
    end
  end

  # Reads every line of +input+ (anything responding to #each; STDIN by
  # default), buffers the records per side, then emits the cross product.
  # The optional block is forwarded to #output.
  def process_stream(input = STDIN, &blk)
    @join = [] # big memory, big prizes

    input.each do |line|
      key, side, value = line.chomp.split(@sep_in, 3)

      # A blank line splits to an empty array; skip it instead of buffering
      # a [nil, nil] record on the left side.
      next if key.nil?

      (@join[side.to_i] ||= []) << [key, value]
    end

    output(&blk) unless @join.empty?
  end
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
require 'streaming_join/join'
|
2
|
+
|
3
|
+
# Full outer join: emits matched left/right pairs for a key, plus the
# unmatched rows from whichever side is present when the other is missing.
class FullOuterJoin < Join
  # Emits the joined rows for +key+ from the sides buffered in @join.
  # Rows are yielded to the block when one is given, otherwise printed.
  # Counter lines are written through #report along the way.
  def output key
    report 'keys'

    left_rows, right_rows = @join
    report(left_rows ? 'left keys' : 'null left')
    report(right_rows ? 'right keys' : 'null right')

    # The column count of each side may be supplied through the environment
    # so that an unmatched row can be padded with empty fields for the
    # missing side. Unset variables give 0, which disables padding.
    pad_right = ENV['streaming_join_cols_right'].to_i
    pad_left = ENV['streaming_join_cols_left'].to_i

    if left_rows
      left_rows.each do |l|
        if right_rows
          report 'left and right'
          right_rows.each do |r|
            row = "#{key}#{@sep_out}#{l}#{@sep_out}#{r}"
            if block_given?
              yield row
            else
              puts row
            end
          end
        else
          report 'null right'
          row = "#{key}#{@sep_out}#{l}"
          row << "#{@sep_out}#{Array.new(pad_right).join(@sep_out)}" if pad_right > 0
          if block_given?
            yield row
          else
            puts row
          end
        end
      end
    end

    # Right-only rows are emitted only when the key has no left side at all.
    return right_rows if left_rows || !right_rows

    # The key and optional left padding are the same for every right row.
    prefix = "#{key}#{@sep_out}"
    prefix << "#{Array.new(pad_left).join(@sep_out)}#{@sep_out}" if pad_left > 0

    right_rows.each do |r|
      report 'null left'
      row = "#{prefix}#{r}"
      if block_given?
        yield row
      else
        puts row
      end
    end
  end
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
# Base join class. On its own it performs an inner join: a key produces
# output only when both a left and a right side were seen for it.
#
# Input lines have the form "key<sep>side<sep>value". side.to_i selects the
# slot in @join: slot 0 is read back as the left side, slot 1 as the right.
# Lines with the same key are expected to arrive consecutively, because a
# group is flushed as soon as the key changes.
class Join
  # Reads the input and output field separators from the environment,
  # defaulting both to a tab.
  def initialize
    @sep_in = ENV['stream_map_output_field_separator'] || "\t"
    @sep_out = ENV['streaming_join_output_separator'] || "\t"
    # An all-digit output separator, optionally prefixed with "\" or "\u",
    # is read as a hexadecimal character code (e.g. "0001" becomes "\x01").
    @sep_out = $1.hex.chr if @sep_out =~ /\A(?:\\u?)?(\d+)\Z/
  end

  # Writes a counter line for +detail+ to STDERR in the
  # "reporter:counter:<group>,<counter>,<amount>" form.
  # NOTE(review): presumably consumed by Hadoop Streaming as a counter
  # update -- confirm against the Hadoop Streaming documentation.
  def report detail
    STDERR.puts "reporter:counter:join,#{detail},1"
  end

  # Emits "key<sep>left<sep>right" for every left/right pair buffered for
  # +key+. Nothing is emitted when either side is missing. Each row is
  # yielded to the block when one is given, otherwise printed with puts.
  def output key
    report 'keys'

    left, right = @join
    if left.nil?
      report 'null left'
      return
    elsif right.nil?
      report 'null right'
      return
    end

    left.each do |l|
      report 'left and right'
      right.each do |r|
        o = "#{key}#{@sep_out}#{l}#{@sep_out}#{r}"
        block_given? ? (yield o) : (puts o)
      end
    end
  end

  # Reads +input+ (anything responding to #each; STDIN by default) line by
  # line, buffers the values of the current key per side, and calls #output
  # each time the key changes and once more at the end of the stream.
  # The optional block is forwarded to #output.
  def process_stream(input = STDIN, &blk)
    last_key = nil
    @join = []

    input.each do |line|
      key, side, value = line.chomp.split(@sep_in, 3)

      # A blank line splits to an empty array, so key is nil. Skip it:
      # otherwise it would flush the current group and then add a nil left
      # value to the buffer that the next key inherits.
      next if key.nil?

      if last_key && last_key != key
        output(last_key, &blk)
        @join = []
      end

      (@join[side.to_i] ||= []) << value
      last_key = key
    end

    # Flush the final group. last_key is set only once a record was seen.
    output(last_key, &blk) if last_key
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'streaming_join/join'
|
2
|
+
|
3
|
+
# Left outer join: every left row for a key is emitted, paired with each
# right row when the key has a right side, or on its own when it does not.
class LeftOuterJoin < Join
  # Emits the joined rows for +key+ from the sides buffered in @join.
  # Rows are yielded to the block when one is given, otherwise printed.
  def output key
    report 'keys'

    left_rows, right_rows = @join
    unless left_rows
      report 'null left'
      return
    end
    report 'left keys'

    # The right side's column count may be supplied through the environment
    # so an unmatched left row can be padded with empty right-hand fields.
    # An unset variable gives 0, which disables padding.
    pad_right = ENV['streaming_join_cols_right'].to_i

    left_rows.each do |l|
      if right_rows
        report 'left and right'
        right_rows.each do |r|
          row = "#{key}#{@sep_out}#{l}#{@sep_out}#{r}"
          if block_given?
            yield row
          else
            puts row
          end
        end
      else
        report 'null right'
        row = "#{key}#{@sep_out}#{l}"
        row << "#{@sep_out}#{Array.new(pad_right).join(@sep_out)}" if pad_right > 0
        if block_given?
          yield row
        else
          puts row
        end
      end
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'streaming_join'
|
2
|
+
|
3
|
+
# Merges all values seen for a key into a single output row.
#
# Input lines have the form "key<sep>value". Lines with the same key are
# expected to arrive consecutively, because a group is flushed as soon as
# the key changes.
class MergeRows < Join
  # Emits one row for +key+: the key followed by every buffered value,
  # joined with the output separator. The row is yielded to the block when
  # one is given, otherwise printed with puts.
  def output key
    report 'keys'

    o = "#{key}#{@sep_out}#{@join.join(@sep_out)}"
    block_given? ? (yield o) : (puts o)
  end

  # Reads +input+ (anything responding to #each; STDIN by default) line by
  # line, buffers the values of the current key, and calls #output each
  # time the key changes and once more at the end of the stream.
  #
  # The optional block is forwarded to #output, matching
  # Join#process_stream. Previously a block passed here was silently
  # ignored and rows were always printed.
  def process_stream(input = STDIN, &blk)
    last_key = nil
    @join = []

    input.each do |line|
      key, value = line.chomp.split(@sep_in, 2)

      # A blank line splits to an empty array, so key is nil. Skip it:
      # otherwise it would flush the current group and then add a nil
      # value to the buffer that the next key inherits.
      next if key.nil?

      if last_key && last_key != key
        output(last_key, &blk)
        @join = []
      end

      @join << value
      last_key = key
    end

    # Flush the final group. last_key is set only once a record was seen.
    output(last_key, &blk) if last_key
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'streaming_join/join'
|
2
|
+
|
3
|
+
# Right outer join: every right row for a key is emitted, paired with each
# left row when the key has a left side, or on its own when it does not.
class RightOuterJoin < Join
  # Emits the joined rows for +key+ from the sides buffered in @join.
  # Rows are yielded to the block when one is given, otherwise printed.
  def output key
    report 'keys'

    left_rows, right_rows = @join
    unless right_rows
      report 'null right'
      return
    end
    report 'right keys'

    # The left side's column count may be supplied through the environment
    # so an unmatched right row can be padded with empty left-hand fields.
    # An unset variable gives 0, which disables padding.
    pad_left = ENV['streaming_join_cols_left'].to_i

    right_rows.each do |r|
      if left_rows
        report 'left and right'
        left_rows.each do |l|
          row = "#{key}#{@sep_out}#{l}#{@sep_out}#{r}"
          if block_given?
            yield row
          else
            puts row
          end
        end
      else
        report 'null left'
        row = "#{key}#{@sep_out}"
        row << "#{Array.new(pad_left).join(@sep_out)}#{@sep_out}" if pad_left > 0
        row << "#{r}"
        if block_given?
          yield row
        else
          puts row
        end
      end
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: streaming_join
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 1
|
8
|
+
- 0
|
9
|
+
version: 0.1.0
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Frank Fejes
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2011-08-19 00:00:00 -05:00
|
18
|
+
default_executable:
|
19
|
+
dependencies: []
|
20
|
+
|
21
|
+
description: Classes to process various joins in Hadoop Streaming.
|
22
|
+
email: frank@fejes.net
|
23
|
+
executables: []
|
24
|
+
|
25
|
+
extensions: []
|
26
|
+
|
27
|
+
extra_rdoc_files: []
|
28
|
+
|
29
|
+
files:
|
30
|
+
- README
|
31
|
+
- CHANGELOG
|
32
|
+
- lib/streaming_join.rb
|
33
|
+
- lib/streaming_join/join.rb
|
34
|
+
- lib/streaming_join/left_outer_join.rb
|
35
|
+
- lib/streaming_join/right_outer_join.rb
|
36
|
+
- lib/streaming_join/full_outer_join.rb
|
37
|
+
- lib/streaming_join/cross_join.rb
|
38
|
+
- lib/streaming_join/merge_rows.rb
|
39
|
+
has_rdoc: true
|
40
|
+
homepage: https://github.com/fsfiii/streaming_join
|
41
|
+
licenses: []
|
42
|
+
|
43
|
+
post_install_message:
|
44
|
+
rdoc_options: []
|
45
|
+
|
46
|
+
require_paths:
|
47
|
+
- lib
|
48
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ">="
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
segments:
|
54
|
+
- 0
|
55
|
+
version: "0"
|
56
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
segments:
|
62
|
+
- 0
|
63
|
+
version: "0"
|
64
|
+
requirements: []
|
65
|
+
|
66
|
+
rubyforge_project: streaming_join
|
67
|
+
rubygems_version: 1.3.7
|
68
|
+
signing_key:
|
69
|
+
specification_version: 3
|
70
|
+
summary: Classes to process joins in Hadoop Streaming
|
71
|
+
test_files: []
|
72
|
+
|