streaming_join 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +6 -1
- data/lib/streaming_join/join_mapper.rb +71 -0
- data/lib/streaming_join.rb +1 -0
- metadata +6 -5
data/CHANGELOG
CHANGED
@@ -1,2 +1,7 @@
|
|
1
|
-
* 2011-08-19 - fsfiii
|
1
|
+
* 2011-08-22 - fsfiii
|
2
|
+
- new: add JoinMapper class, intended to be used as the mapper portion
|
3
|
+
- new: added examples/job_full which runs an entire job with both map and
|
4
|
+
reduce sides using the framework
|
5
|
+
|
6
|
+
* 2011-08-19 - fsfiii
|
2
7
|
- initial import
|
@@ -0,0 +1,71 @@
|
|
1
|
+
# Mapper side of a streaming join for Hadoop Streaming: selects columns from
# each input record and tags the record with the join "side" it came from, so
# the reducer can merge records sharing a key.  Configuration arrives through
# the environment variables Hadoop Streaming sets for each map task.
class JoinMapper
  def initialize
    # use our own input field separator variable since the stock
    # stream variable can't handle control characters
    @sep_in = ENV['streaming_join_input_field_separator'] || "\t"
    # decode escaped separators such as "\u0001" or "01" into the single
    # character they name (digits are read as hex via String#hex)
    @sep_in = $1.hex.chr if @sep_in =~ /\A(?:\\u?)?(\d+)\Z/
    @sep_out = ENV['stream_map_output_field_separator'] || "\t"
    # FIX: the original tested @sep_in here (copy-paste error), so an escaped
    # output separator was never decoded; the guard must inspect @sep_out.
    @sep_out = $1.hex.chr if @sep_out =~ /\A(?:\\u?)?(\d+)\Z/

    # one config hash per registered join side, in add_side call order
    @join = []
  end

  # Emit a Hadoop Streaming counter increment on stderr (the "reporter:"
  # protocol); detail names the counter within the "join" group.
  def report detail
    STDERR.puts "reporter:counter:join,#{detail},1"
  end

  # Register one side of the join.
  #   file_re - Regexp matched against the map input file path to pick this side
  #   columns - input column indexes to emit; the first is the join key
  #   filter  - optional block; records for which it returns falsy are dropped
  # Returns the side's config hash (also retained internally).
  def add_side(file_re, *columns, &filter)
    h = {
      file_re: file_re,
      columns: columns,
      filter: filter,
      sep: @sep_in,
      side: @join.size
    }
    @join << h
    h
  end

  # Merge extra options (e.g. a per-side :sep override) into every registered
  # side whose file_re equals the given one.
  def add_opts(file_re, opts)
    @join.each do |j|
      next if j[:file_re] != file_re
      j.merge! opts
    end
  end

  # Determine which registered side the current map task is processing by
  # matching Hadoop's map_input_file environment variable against each side's
  # file_re.  Raises when nothing matches — guessing would corrupt the join.
  def join_side
    input_file = ENV['map_input_file']
    @join.each do |j|
      return j if input_file =~ j[:file_re]
    end
    raise "how do I handle input file '#{input_file}'?"
  end

  # Read separator-delimited records from input (default STDIN) and print
  # "key<sep_out>side<sep_out>col..." lines for the reduce side.  Records with
  # a nil key column, or rejected by the side's filter, are skipped.
  def process_stream(input = STDIN)
    j = join_side
    cols = j[:columns]
    filter = j[:filter]
    side = j[:side]
    sep = j[:sep]

    input.each do |line|
      # split limit of -1 preserves trailing empty fields
      fields = line.chomp.split(sep, -1)

      c = []
      cols.each_with_index do |col, i|
        value = fields[col]
        break if i == 0 && value.nil? # can't have nil key
        c << value
      end
      next if c.empty?

      next if filter && !filter.call(c)

      # join key, side tag, then the remaining selected columns
      o = "#{c[0]}#{@sep_out}#{side}#{@sep_out}"
      o << c[1...c.length].join(@sep_out)
      puts o
    end
  end
end
|
data/lib/streaming_join.rb
CHANGED
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 0
|
7
|
-
-
|
7
|
+
- 2
|
8
8
|
- 0
|
9
|
-
version: 0.1.0
|
9
|
+
version: 0.2.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Frank Fejes
|
@@ -14,11 +14,11 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-08-19 00:00:00 -05:00
|
17
|
+
date: 2011-08-22 00:00:00 -05:00
|
18
18
|
default_executable:
|
19
19
|
dependencies: []
|
20
20
|
|
21
|
-
description: Classes to process various joins in Hadoop Streaming.
|
21
|
+
description: Classes to process various joins in Hadoop Map/Reduce Streaming.
|
22
22
|
email: frank@fejes.net
|
23
23
|
executables: []
|
24
24
|
|
@@ -36,6 +36,7 @@ files:
|
|
36
36
|
- lib/streaming_join/full_outer_join.rb
|
37
37
|
- lib/streaming_join/cross_join.rb
|
38
38
|
- lib/streaming_join/merge_rows.rb
|
39
|
+
- lib/streaming_join/join_mapper.rb
|
39
40
|
has_rdoc: true
|
40
41
|
homepage: https://github.com/fsfiii/streaming_join
|
41
42
|
licenses: []
|
@@ -67,6 +68,6 @@ rubyforge_project: streaming_join
|
|
67
68
|
rubygems_version: 1.3.7
|
68
69
|
signing_key:
|
69
70
|
specification_version: 3
|
70
|
-
summary: Classes to process joins in Hadoop Streaming
|
71
|
+
summary: Classes to process joins in Hadoop Map/Reduce Streaming.
|
71
72
|
test_files: []
|
72
73
|
|