streaming_join 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +11 -0
- data/README +58 -0
- data/lib/streaming_join/full_outer_join.rb +17 -47
- data/lib/streaming_join/join.rb +52 -27
- data/lib/streaming_join/left_outer_join.rb +6 -28
- data/lib/streaming_join/right_outer_join.rb +6 -30
- metadata +3 -3
data/CHANGELOG
CHANGED
@@ -1,3 +1,14 @@
|
|
1
|
+
* 2011-08-27 - fsfiii
|
2
|
+
- update: Redesigned join logic so as to not read individual keyspaces
|
3
|
+
into memory since this simply won't scale. Now only the left
|
4
|
+
side of a keyspace will be placed into memory (one keyspace at
|
5
|
+
a time, only) and the right side will be processed as values are
|
6
|
+
read in. This now requires KeyFieldBasedPartitioner so that the
|
7
|
+
values are sorted left side first.
|
8
|
+
- clean: removed example/job and renamed example/job_full to example/job
|
9
|
+
- new: add opts passing to Join class
|
10
|
+
- new: add :report opt for Join class to control reporting of key stats
|
11
|
+
|
1
12
|
* 2011-08-22 - fsfiii
|
2
13
|
- new: add JoinMapper class, intended to be used as the mapper portion
|
3
14
|
- new: added examples/job_full which runs an entire job with both map and
|
data/README
CHANGED
@@ -18,6 +18,64 @@ right_outer_join
|
|
18
18
|
full_outer_join
|
19
19
|
merge_rows
|
20
20
|
|
21
|
+
API
|
22
|
+
|
23
|
+
As with most map/reduce jobs, there are two basic components:
|
24
|
+
|
25
|
+
Mapper
|
26
|
+
|
27
|
+
The mapper in a streaming_join job outputs records of the form:
|
28
|
+
|
29
|
+
key TAB side_index TAB value
|
30
|
+
|
31
|
+
The "left" side uses side_index 0 while the right side would use 1.
|
32
|
+
|
33
|
+
In a simple join, here is one example mapper output:
|
34
|
+
|
35
|
+
1 0 Matsumoto
|
36
|
+
2 0 Wall
|
37
|
+
3 0 van Rossum
|
38
|
+
|
39
|
+
Here is the other side's output:
|
40
|
+
|
41
|
+
1 1 Ruby 1995
|
42
|
+
2 1 Perl 1987
|
43
|
+
4 1 Clojure 2007
|
44
|
+
|
45
|
+
Reducer
|
46
|
+
|
47
|
+
Depending on the join style, the reducer will emit combined records. Using
|
48
|
+
the above mapper output, an inner join would end up like:
|
49
|
+
|
50
|
+
1 Matsumoto Ruby
|
51
|
+
2 Wall Perl
|
52
|
+
|
53
|
+
The code would look something like this:
|
54
|
+
|
55
|
+
Mapper
|
56
|
+
|
57
|
+
require 'streaming_join'
|
58
|
+
|
59
|
+
j = JoinMapper.new
|
60
|
+
j.add_side(/_left/, 0, 1)
|
61
|
+
j.add_side(/_right/, 0, 1)
|
62
|
+
j.process_stream
|
63
|
+
|
64
|
+
Reducer
|
65
|
+
|
66
|
+
require 'streaming_join'
|
67
|
+
|
68
|
+
Join.new.process_stream
|
69
|
+
|
70
|
+
See examples/job for more detail and comments.
|
71
|
+
|
72
|
+
Current Limitations
|
73
|
+
|
74
|
+
- As each key is processed in a reducer, the left side of that single
|
75
|
+
keyspace must fit into memory. So, when in doubt, put the smaller
|
76
|
+
table on the left.
|
77
|
+
- Only two tables can be joined in a single job.
|
78
|
+
|
21
79
|
Please let me know if you find this software useful!
|
22
80
|
|
23
81
|
--frank
|
data/lib/streaming_join/full_outer_join.rb
CHANGED
@@ -1,59 +1,29 @@
|
|
1
1
|
require 'streaming_join/join'
|
2
2
|
|
3
3
|
class FullOuterJoin < Join
|
4
|
-
def
|
5
|
-
|
6
|
-
|
7
|
-
left,right = @join
|
8
|
-
if not left
|
9
|
-
report 'null left'
|
10
|
-
else
|
11
|
-
report 'left keys'
|
12
|
-
end
|
13
|
-
if not right
|
14
|
-
report 'null right'
|
15
|
-
else
|
16
|
-
report 'right keys'
|
17
|
-
end
|
18
|
-
|
19
|
-
#p left, right
|
20
|
-
|
21
|
-
# the number of columns on the sides can be passed in as env variables
|
22
|
-
# so that the full record with empty "" values can be displayed even if
|
23
|
-
# there is no match
|
24
|
-
cols_r = ENV['streaming_join_cols_right'].to_i
|
25
|
-
cols_l = ENV['streaming_join_cols_left'].to_i
|
4
|
+
def null_right key, left
|
5
|
+
o = "#{key}#{@sep_out}"
|
26
6
|
|
27
7
|
left.each do |l|
|
28
|
-
if
|
29
|
-
|
30
|
-
right.each do |r|
|
31
|
-
o = "#{key}#{@sep_out}#{l}#{@sep_out}#{r}"
|
32
|
-
block_given? ? (yield o) : (puts o)
|
33
|
-
end
|
34
|
-
elsif cols_r > 0
|
35
|
-
report 'null right'
|
36
|
-
o = "#{key}#{@sep_out}"
|
37
|
-
o << "#{l}#{@sep_out}#{Array.new(cols_r).join(@sep_out)}"
|
38
|
-
block_given? ? (yield o) : (puts o)
|
8
|
+
if @cols_r > 0
|
9
|
+
o << "#{l}#{@sep_out}#{Array.new(@cols_r).join(@sep_out)}"
|
39
10
|
else
|
40
|
-
report 'null right'
|
41
11
|
o = "#{key}#{@sep_out}#{l}"
|
42
|
-
block_given? ? (yield o) : (puts o)
|
43
12
|
end
|
44
|
-
end
|
13
|
+
end
|
45
14
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
15
|
+
block_given? ? (yield o) : (puts o)
|
16
|
+
end
|
17
|
+
|
18
|
+
def null_left key, right
|
19
|
+
o = "#{key}#{@sep_out}"
|
20
|
+
|
21
|
+
if @cols_l > 0
|
22
|
+
o << "#{Array.new(@cols_l).join(@sep_out)}#{@sep_out}#{right}"
|
23
|
+
else
|
24
|
+
o = "#{key}#{@sep_out}#{right}"
|
25
|
+
end
|
55
26
|
|
56
|
-
|
57
|
-
end if right
|
27
|
+
block_given? ? (yield o) : (puts o)
|
58
28
|
end
|
59
29
|
end
|
data/lib/streaming_join/join.rb
CHANGED
@@ -1,56 +1,81 @@
|
|
1
1
|
# base Join class (inner join)
|
2
2
|
class Join
|
3
|
-
def initialize
|
3
|
+
def initialize opts = {}
|
4
4
|
@sep_in = ENV['stream_map_output_field_separator'] || "\t"
|
5
5
|
@sep_out = ENV['streaming_join_output_separator'] || "\t"
|
6
6
|
@sep_out = $1.hex.chr if @sep_out =~ /\A(?:\\u?)?(\d+)\Z/
|
7
|
+
@report = opts.fetch :report, true
|
8
|
+
@cols_l = ENV['streaming_join_cols_left'].to_i
|
9
|
+
@cols_r = ENV['streaming_join_cols_right'].to_i
|
7
10
|
end
|
8
11
|
|
9
12
|
def report detail
|
13
|
+
return if not @report
|
10
14
|
STDERR.puts "reporter:counter:join,#{detail},1"
|
11
15
|
end
|
12
16
|
|
13
|
-
def output key
|
14
|
-
report 'keys'
|
15
|
-
|
16
|
-
left,right = @join
|
17
|
-
if not left
|
18
|
-
report 'null left'
|
19
|
-
return
|
20
|
-
elsif not right
|
21
|
-
report 'null right'
|
22
|
-
return
|
23
|
-
end
|
24
|
-
|
17
|
+
def output key, left, right
|
25
18
|
left.each do |l|
|
26
|
-
|
27
|
-
|
28
|
-
o
|
29
|
-
|
30
|
-
|
31
|
-
else
|
32
|
-
puts o
|
33
|
-
end
|
19
|
+
o = "#{key}#{@sep_out}#{l}#{@sep_out}#{right}"
|
20
|
+
if block_given?
|
21
|
+
yield o
|
22
|
+
else
|
23
|
+
puts o
|
34
24
|
end
|
35
25
|
end
|
36
26
|
end
|
37
27
|
|
28
|
+
def null_left key, right
|
29
|
+
end
|
30
|
+
|
31
|
+
def null_right key, left
|
32
|
+
end
|
33
|
+
|
38
34
|
def process_stream(input = STDIN, &blk)
|
39
35
|
last_key = key = nil
|
40
|
-
|
36
|
+
last_side = nil
|
37
|
+
left = []
|
41
38
|
|
42
39
|
input.each do |line|
|
43
40
|
key, side, value = line.chomp.split(@sep_in, 3)
|
44
41
|
|
45
|
-
|
46
|
-
|
47
|
-
|
42
|
+
report 'keys' if last_key != key
|
43
|
+
|
44
|
+
side = side.to_i
|
45
|
+
if side == 0
|
46
|
+
# if we are on the left side and just processed the left side
|
47
|
+
# of another key, we didn't get any right side records
|
48
|
+
if last_key != key and last_side == 0
|
49
|
+
report 'null right'
|
50
|
+
null_right last_key, left
|
51
|
+
end
|
52
|
+
|
53
|
+
if last_key != key
|
54
|
+
left = []
|
55
|
+
end
|
56
|
+
left << value
|
57
|
+
else
|
58
|
+
# if we're in a new key and the first record is a right side
|
59
|
+
# record, that means we never processed a left side
|
60
|
+
if not last_key or last_key != key or left.empty?
|
61
|
+
report 'null left' if last_key != key
|
62
|
+
null_left key, value
|
63
|
+
left = []
|
64
|
+
else
|
65
|
+
report 'left and right' if last_side == 0
|
66
|
+
output key, left, value
|
67
|
+
end
|
48
68
|
end
|
49
69
|
|
50
|
-
|
70
|
+
last_side = side
|
51
71
|
last_key = key
|
52
72
|
end
|
53
73
|
|
54
|
-
|
74
|
+
# if the last processed record is on the left, there is
|
75
|
+
# not going to be a right side
|
76
|
+
if last_side == 0
|
77
|
+
report 'null right'
|
78
|
+
null_right(key, left)
|
79
|
+
end
|
55
80
|
end
|
56
81
|
end
|
data/lib/streaming_join/left_outer_join.rb
CHANGED
@@ -1,39 +1,17 @@
|
|
1
1
|
require 'streaming_join/join'
|
2
2
|
|
3
3
|
class LeftOuterJoin < Join
|
4
|
-
def
|
5
|
-
|
6
|
-
|
7
|
-
left,right = @join
|
8
|
-
if not left
|
9
|
-
report 'null left'
|
10
|
-
return
|
11
|
-
else
|
12
|
-
report 'left keys'
|
13
|
-
end
|
14
|
-
|
15
|
-
# the number of columns on the right can be passed in as an env variable
|
16
|
-
# so that the full record with empty "" values can be displayed even if
|
17
|
-
# there is no match
|
18
|
-
cols_r = ENV['streaming_join_cols_right'].to_i
|
4
|
+
def null_right key, left
|
5
|
+
o = "#{key}#{@sep_out}"
|
19
6
|
|
20
7
|
left.each do |l|
|
21
|
-
if
|
22
|
-
|
23
|
-
right.each do |r|
|
24
|
-
o = "#{key}#{@sep_out}#{l}#{@sep_out}#{r}"
|
25
|
-
block_given? ? (yield o) : (puts o)
|
26
|
-
end
|
27
|
-
elsif cols_r > 0
|
28
|
-
report 'null right'
|
29
|
-
o = "#{key}#{@sep_out}"
|
30
|
-
o << "#{l}#{@sep_out}#{Array.new(cols_r).join(@sep_out)}"
|
31
|
-
block_given? ? (yield o) : (puts o)
|
8
|
+
if @cols_r > 0
|
9
|
+
o << "#{l}#{@sep_out}#{Array.new(@cols_r).join(@sep_out)}"
|
32
10
|
else
|
33
|
-
report 'null right'
|
34
11
|
o = "#{key}#{@sep_out}#{l}"
|
35
|
-
block_given? ? (yield o) : (puts o)
|
36
12
|
end
|
37
13
|
end
|
14
|
+
|
15
|
+
block_given? ? (yield o) : (puts o)
|
38
16
|
end
|
39
17
|
end
|
data/lib/streaming_join/right_outer_join.rb
CHANGED
@@ -1,39 +1,15 @@
|
|
1
1
|
require 'streaming_join/join'
|
2
2
|
|
3
3
|
class RightOuterJoin < Join
|
4
|
-
def
|
5
|
-
|
4
|
+
def null_left key, right
|
5
|
+
o = "#{key}#{@sep_out}"
|
6
6
|
|
7
|
-
|
8
|
-
|
9
|
-
report 'null right'
|
10
|
-
return
|
7
|
+
if @cols_l > 0
|
8
|
+
o << "#{Array.new(@cols_l).join(@sep_out)}#{@sep_out}#{right}"
|
11
9
|
else
|
12
|
-
|
10
|
+
o = "#{key}#{@sep_out}#{right}"
|
13
11
|
end
|
14
12
|
|
15
|
-
|
16
|
-
# so that the full record with empty "" values can be displayed even if
|
17
|
-
# there is no match
|
18
|
-
cols_l = ENV['streaming_join_cols_left'].to_i
|
19
|
-
|
20
|
-
right.each do |r|
|
21
|
-
if left
|
22
|
-
report 'left and right'
|
23
|
-
left.each do |l|
|
24
|
-
o = "#{key}#{@sep_out}#{l}#{@sep_out}#{r}"
|
25
|
-
block_given? ? (yield o) : (puts o)
|
26
|
-
end
|
27
|
-
elsif cols_l > 0
|
28
|
-
report 'null left'
|
29
|
-
o = "#{key}#{@sep_out}"
|
30
|
-
o << "#{Array.new(cols_l).join(@sep_out)}#{@sep_out}#{r}"
|
31
|
-
block_given? ? (yield o) : (puts o)
|
32
|
-
else
|
33
|
-
report 'null left'
|
34
|
-
o = "#{key}#{@sep_out}#{r}"
|
35
|
-
block_given? ? (yield o) : (puts o)
|
36
|
-
end
|
37
|
-
end
|
13
|
+
block_given? ? (yield o) : (puts o)
|
38
14
|
end
|
39
15
|
end
|
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 0
|
7
|
-
- 2
|
7
|
+
- 3
|
8
8
|
- 0
|
9
|
-
version: 0.2.0
|
9
|
+
version: 0.3.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Frank Fejes
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-08-
|
17
|
+
date: 2011-08-27 00:00:00 -05:00
|
18
18
|
default_executable:
|
19
19
|
dependencies: []
|
20
20
|
|