streaming_join 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +11 -0
- data/README +58 -0
- data/lib/streaming_join/full_outer_join.rb +17 -47
- data/lib/streaming_join/join.rb +52 -27
- data/lib/streaming_join/left_outer_join.rb +6 -28
- data/lib/streaming_join/right_outer_join.rb +6 -30
- metadata +3 -3
data/CHANGELOG
CHANGED
@@ -1,3 +1,14 @@
|
|
1
|
+
* 2011-08-27 - fsfiii
|
2
|
+
- update: Redesigned join logic so as to not read individual keyspaces
|
3
|
+
into memory since this simply won't scale. Now only the left
|
4
|
+
side of a keyspace will be placed into memory (one keyspace at
|
5
|
+
a time, only) and the right side will be processed as values are
|
6
|
+
read in. This now requires KeyFieldBasedPartitioner so that the
|
7
|
+
values are sorted left side first.
|
8
|
+
- clean: removed examples/job and renamed examples/job_full to examples/job
|
9
|
+
- new: add opts passing to Join class
|
10
|
+
- new: add :report opt for Join class to control reporting of key stats
|
11
|
+
|
1
12
|
* 2011-08-22 - fsfiii
|
2
13
|
- new: add JoinMapper class, intended to be used as the mapper portion
|
3
14
|
- new: added examples/job_full which runs an entire job with both map and
|
data/README
CHANGED
@@ -18,6 +18,64 @@ right_outer_join
|
|
18
18
|
full_outer_join
|
19
19
|
merge_rows
|
20
20
|
|
21
|
+
API
|
22
|
+
|
23
|
+
As with most map/reduce jobs, there are two basic components:
|
24
|
+
|
25
|
+
Mapper
|
26
|
+
|
27
|
+
The mapper in a streaming_join job outputs records of the form:
|
28
|
+
|
29
|
+
key TAB side_index TAB value
|
30
|
+
|
31
|
+
The "left" side uses side_index 0, while the "right" side uses side_index 1.
|
32
|
+
|
33
|
+
In a simple join, here is one example mapper output:
|
34
|
+
|
35
|
+
1 0 Matsumoto
|
36
|
+
2 0 Wall
|
37
|
+
3 0 van Rossum
|
38
|
+
|
39
|
+
Here is the other side's output:
|
40
|
+
|
41
|
+
1 1 Ruby 1995
|
42
|
+
2 1 Perl 1987
|
43
|
+
4 1 Clojure 2007
|
44
|
+
|
45
|
+
Reducer
|
46
|
+
|
47
|
+
Depending on the join style, the reducer will emit combined records. Using
|
48
|
+
the above mapper output, an inner join would end up like:
|
49
|
+
|
50
|
+
1 Matsumoto Ruby
|
51
|
+
2 Wall Perl
|
52
|
+
|
53
|
+
The code would look something like this:
|
54
|
+
|
55
|
+
Mapper
|
56
|
+
|
57
|
+
require 'streaming_join'
|
58
|
+
|
59
|
+
j = JoinMapper.new
|
60
|
+
j.add_side(/_left/, 0, 1)
|
61
|
+
j.add_side(/_right/, 0, 1)
|
62
|
+
j.process_stream
|
63
|
+
|
64
|
+
Reducer
|
65
|
+
|
66
|
+
require 'streaming_join'
|
67
|
+
|
68
|
+
Join.new.process_stream
|
69
|
+
|
70
|
+
See examples/job for more detail and comments.
|
71
|
+
|
72
|
+
Current Limitations
|
73
|
+
|
74
|
+
- As each key is processed in a reducer, the left side of that single
|
75
|
+
keyspace must fit into memory. So, when in doubt, put the smaller
|
76
|
+
table on the left.
|
77
|
+
- Only two tables can be joined in a single job.
|
78
|
+
|
21
79
|
Please let me know if you find this software useful!
|
22
80
|
|
23
81
|
--frank
|
@@ -1,59 +1,29 @@
|
|
1
1
|
require 'streaming_join/join'
|
2
2
|
|
3
3
|
class FullOuterJoin < Join
|
4
|
-
def
|
5
|
-
|
6
|
-
|
7
|
-
left,right = @join
|
8
|
-
if not left
|
9
|
-
report 'null left'
|
10
|
-
else
|
11
|
-
report 'left keys'
|
12
|
-
end
|
13
|
-
if not right
|
14
|
-
report 'null right'
|
15
|
-
else
|
16
|
-
report 'right keys'
|
17
|
-
end
|
18
|
-
|
19
|
-
#p left, right
|
20
|
-
|
21
|
-
# the number of columns on the sides can be passed in as env variables
|
22
|
-
# so that the full record with empty "" values can be displayed even if
|
23
|
-
# there is no match
|
24
|
-
cols_r = ENV['streaming_join_cols_right'].to_i
|
25
|
-
cols_l = ENV['streaming_join_cols_left'].to_i
|
4
|
+
def null_right key, left
|
5
|
+
o = "#{key}#{@sep_out}"
|
26
6
|
|
27
7
|
left.each do |l|
|
28
|
-
if
|
29
|
-
|
30
|
-
right.each do |r|
|
31
|
-
o = "#{key}#{@sep_out}#{l}#{@sep_out}#{r}"
|
32
|
-
block_given? ? (yield o) : (puts o)
|
33
|
-
end
|
34
|
-
elsif cols_r > 0
|
35
|
-
report 'null right'
|
36
|
-
o = "#{key}#{@sep_out}"
|
37
|
-
o << "#{l}#{@sep_out}#{Array.new(cols_r).join(@sep_out)}"
|
38
|
-
block_given? ? (yield o) : (puts o)
|
8
|
+
if @cols_r > 0
|
9
|
+
o << "#{l}#{@sep_out}#{Array.new(@cols_r).join(@sep_out)}"
|
39
10
|
else
|
40
|
-
report 'null right'
|
41
11
|
o = "#{key}#{@sep_out}#{l}"
|
42
|
-
block_given? ? (yield o) : (puts o)
|
43
12
|
end
|
44
|
-
end
|
13
|
+
end
|
45
14
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
15
|
+
block_given? ? (yield o) : (puts o)
|
16
|
+
end
|
17
|
+
|
18
|
+
def null_left key, right
|
19
|
+
o = "#{key}#{@sep_out}"
|
20
|
+
|
21
|
+
if @cols_l > 0
|
22
|
+
o << "#{Array.new(@cols_l).join(@sep_out)}#{@sep_out}#{right}"
|
23
|
+
else
|
24
|
+
o = "#{key}#{@sep_out}#{right}"
|
25
|
+
end
|
55
26
|
|
56
|
-
|
57
|
-
end if right
|
27
|
+
block_given? ? (yield o) : (puts o)
|
58
28
|
end
|
59
29
|
end
|
data/lib/streaming_join/join.rb
CHANGED
@@ -1,56 +1,81 @@
|
|
1
1
|
# base Join class (inner join)
|
2
2
|
class Join
|
3
|
-
def initialize
|
3
|
+
def initialize opts = {}
|
4
4
|
@sep_in = ENV['stream_map_output_field_separator'] || "\t"
|
5
5
|
@sep_out = ENV['streaming_join_output_separator'] || "\t"
|
6
6
|
@sep_out = $1.hex.chr if @sep_out =~ /\A(?:\\u?)?(\d+)\Z/
|
7
|
+
@report = opts.fetch :report, true
|
8
|
+
@cols_l = ENV['streaming_join_cols_left'].to_i
|
9
|
+
@cols_r = ENV['streaming_join_cols_right'].to_i
|
7
10
|
end
|
8
11
|
|
9
12
|
def report detail
|
13
|
+
return if not @report
|
10
14
|
STDERR.puts "reporter:counter:join,#{detail},1"
|
11
15
|
end
|
12
16
|
|
13
|
-
def output key
|
14
|
-
report 'keys'
|
15
|
-
|
16
|
-
left,right = @join
|
17
|
-
if not left
|
18
|
-
report 'null left'
|
19
|
-
return
|
20
|
-
elsif not right
|
21
|
-
report 'null right'
|
22
|
-
return
|
23
|
-
end
|
24
|
-
|
17
|
+
def output key, left, right
|
25
18
|
left.each do |l|
|
26
|
-
|
27
|
-
|
28
|
-
o
|
29
|
-
|
30
|
-
|
31
|
-
else
|
32
|
-
puts o
|
33
|
-
end
|
19
|
+
o = "#{key}#{@sep_out}#{l}#{@sep_out}#{right}"
|
20
|
+
if block_given?
|
21
|
+
yield o
|
22
|
+
else
|
23
|
+
puts o
|
34
24
|
end
|
35
25
|
end
|
36
26
|
end
|
37
27
|
|
28
|
+
def null_left key, right
|
29
|
+
end
|
30
|
+
|
31
|
+
def null_right key, left
|
32
|
+
end
|
33
|
+
|
38
34
|
def process_stream(input = STDIN, &blk)
|
39
35
|
last_key = key = nil
|
40
|
-
|
36
|
+
last_side = nil
|
37
|
+
left = []
|
41
38
|
|
42
39
|
input.each do |line|
|
43
40
|
key, side, value = line.chomp.split(@sep_in, 3)
|
44
41
|
|
45
|
-
|
46
|
-
|
47
|
-
|
42
|
+
report 'keys' if last_key != key
|
43
|
+
|
44
|
+
side = side.to_i
|
45
|
+
if side == 0
|
46
|
+
# if we are on the left side and just processed the left side
|
47
|
+
# of another key, we didn't get any right side records
|
48
|
+
if last_key != key and last_side == 0
|
49
|
+
report 'null right'
|
50
|
+
null_right last_key, left
|
51
|
+
end
|
52
|
+
|
53
|
+
if last_key != key
|
54
|
+
left = []
|
55
|
+
end
|
56
|
+
left << value
|
57
|
+
else
|
58
|
+
# if we're in a new key and the first record is a right side
|
59
|
+
# record, that means we never processed a left side
|
60
|
+
if not last_key or last_key != key or left.empty?
|
61
|
+
report 'null left' if last_key != key
|
62
|
+
null_left key, value
|
63
|
+
left = []
|
64
|
+
else
|
65
|
+
report 'left and right' if last_side == 0
|
66
|
+
output key, left, value
|
67
|
+
end
|
48
68
|
end
|
49
69
|
|
50
|
-
|
70
|
+
last_side = side
|
51
71
|
last_key = key
|
52
72
|
end
|
53
73
|
|
54
|
-
|
74
|
+
# if the last processed record is on the left, there is
|
75
|
+
# not going to be a right side
|
76
|
+
if last_side == 0
|
77
|
+
report 'null right'
|
78
|
+
null_right(key, left)
|
79
|
+
end
|
55
80
|
end
|
56
81
|
end
|
@@ -1,39 +1,17 @@
|
|
1
1
|
require 'streaming_join/join'
|
2
2
|
|
3
3
|
class LeftOuterJoin < Join
|
4
|
-
def
|
5
|
-
|
6
|
-
|
7
|
-
left,right = @join
|
8
|
-
if not left
|
9
|
-
report 'null left'
|
10
|
-
return
|
11
|
-
else
|
12
|
-
report 'left keys'
|
13
|
-
end
|
14
|
-
|
15
|
-
# the number of columns on the right can be passed in as an env variable
|
16
|
-
# so that the full record with empty "" values can be displayed even if
|
17
|
-
# there is no match
|
18
|
-
cols_r = ENV['streaming_join_cols_right'].to_i
|
4
|
+
def null_right key, left
|
5
|
+
o = "#{key}#{@sep_out}"
|
19
6
|
|
20
7
|
left.each do |l|
|
21
|
-
if
|
22
|
-
|
23
|
-
right.each do |r|
|
24
|
-
o = "#{key}#{@sep_out}#{l}#{@sep_out}#{r}"
|
25
|
-
block_given? ? (yield o) : (puts o)
|
26
|
-
end
|
27
|
-
elsif cols_r > 0
|
28
|
-
report 'null right'
|
29
|
-
o = "#{key}#{@sep_out}"
|
30
|
-
o << "#{l}#{@sep_out}#{Array.new(cols_r).join(@sep_out)}"
|
31
|
-
block_given? ? (yield o) : (puts o)
|
8
|
+
if @cols_r > 0
|
9
|
+
o << "#{l}#{@sep_out}#{Array.new(@cols_r).join(@sep_out)}"
|
32
10
|
else
|
33
|
-
report 'null right'
|
34
11
|
o = "#{key}#{@sep_out}#{l}"
|
35
|
-
block_given? ? (yield o) : (puts o)
|
36
12
|
end
|
37
13
|
end
|
14
|
+
|
15
|
+
block_given? ? (yield o) : (puts o)
|
38
16
|
end
|
39
17
|
end
|
@@ -1,39 +1,15 @@
|
|
1
1
|
require 'streaming_join/join'
|
2
2
|
|
3
3
|
class RightOuterJoin < Join
|
4
|
-
def
|
5
|
-
|
4
|
+
def null_left key, right
|
5
|
+
o = "#{key}#{@sep_out}"
|
6
6
|
|
7
|
-
|
8
|
-
|
9
|
-
report 'null right'
|
10
|
-
return
|
7
|
+
if @cols_l > 0
|
8
|
+
o << "#{Array.new(@cols_l).join(@sep_out)}#{@sep_out}#{right}"
|
11
9
|
else
|
12
|
-
|
10
|
+
o = "#{key}#{@sep_out}#{right}"
|
13
11
|
end
|
14
12
|
|
15
|
-
|
16
|
-
# so that the full record with empty "" values can be displayed even if
|
17
|
-
# there is no match
|
18
|
-
cols_l = ENV['streaming_join_cols_left'].to_i
|
19
|
-
|
20
|
-
right.each do |r|
|
21
|
-
if left
|
22
|
-
report 'left and right'
|
23
|
-
left.each do |l|
|
24
|
-
o = "#{key}#{@sep_out}#{l}#{@sep_out}#{r}"
|
25
|
-
block_given? ? (yield o) : (puts o)
|
26
|
-
end
|
27
|
-
elsif cols_l > 0
|
28
|
-
report 'null left'
|
29
|
-
o = "#{key}#{@sep_out}"
|
30
|
-
o << "#{Array.new(cols_l).join(@sep_out)}#{@sep_out}#{r}"
|
31
|
-
block_given? ? (yield o) : (puts o)
|
32
|
-
else
|
33
|
-
report 'null left'
|
34
|
-
o = "#{key}#{@sep_out}#{r}"
|
35
|
-
block_given? ? (yield o) : (puts o)
|
36
|
-
end
|
37
|
-
end
|
13
|
+
block_given? ? (yield o) : (puts o)
|
38
14
|
end
|
39
15
|
end
|
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 0
|
7
|
-
-
|
7
|
+
- 3
|
8
8
|
- 0
|
9
|
-
version: 0.
|
9
|
+
version: 0.3.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Frank Fejes
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-08-
|
17
|
+
date: 2011-08-27 00:00:00 -05:00
|
18
18
|
default_executable:
|
19
19
|
dependencies: []
|
20
20
|
|