streaming_join 0.1.0 → 0.2.0
This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- data/CHANGELOG +6 -1
- data/lib/streaming_join/join_mapper.rb +71 -0
- data/lib/streaming_join.rb +1 -0
- metadata +6 -5
data/CHANGELOG
CHANGED
@@ -1,2 +1,7 @@
-* 2011-08-19 - fsfiii
+* 2011-08-22 - fsfiii
+  - new: add JoinMapper class, intended to be used as the mapper portion
+  - new: added examples/job_full which runs an entire job with both map and
+    reduce sides using the framework
+
+* 2011-08-19 - fsfiii
   - initial import
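The changelog entry above describes JoinMapper as the mapper portion of a streaming join job. A minimal mapper script built on it might look like the sketch below; the input path patterns, column positions, and filter block are hypothetical, chosen only to illustrate the add_side/process_stream flow defined in the new file that follows.

#!/usr/bin/env ruby
# Hypothetical mapper script for a Hadoop Streaming job (illustrative only).
require 'streaming_join/join_mapper'

jm = JoinMapper.new

# Side 0: records from files matching /users/ contribute the join key
# (column 0) and the user name (column 1).
jm.add_side(/users/, 0, 1)

# Side 1: records from files matching /orders/ contribute the join key
# (column 2) and the order total (column 3), skipping empty totals.
jm.add_side(/orders/, 2, 3) { |cols| !cols[1].to_s.empty? }

# Hadoop Streaming exposes the current split's path as the map_input_file
# environment variable, which JoinMapper uses to pick the matching side.
jm.process_stream

The companion add_opts method merges extra options (such as a per-side :sep) into every side whose :file_re matches, and report writes Hadoop counter updates to stderr.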
data/lib/streaming_join/join_mapper.rb
ADDED
@@ -0,0 +1,71 @@
+class JoinMapper
+  def initialize
+    # use our own input field separator variable since the stock
+    # stream variable can't handle control characters
+    @sep_in = ENV['streaming_join_input_field_separator'] || "\t"
+    @sep_in = $1.hex.chr if @sep_in =~ /\A(?:\\u?)?(\d+)\Z/
+    @sep_out = ENV['stream_map_output_field_separator'] || "\t"
+    @sep_out = $1.hex.chr if @sep_out =~ /\A(?:\\u?)?(\d+)\Z/
+
+    @join = []
+  end
+
+  def report detail
+    STDERR.puts "reporter:counter:join,#{detail},1"
+  end
+
+  def add_side(file_re, *columns, &filter)
+    h = {
+      file_re: file_re,
+      columns: columns,
+      filter: filter,
+      sep: @sep_in,
+      side: @join.size
+    }
+    @join << h
+    h
+  end
+
+  def add_opts(file_re, opts)
+    @join.each do |j|
+      next if j[:file_re] != file_re
+      j.merge! opts
+    end
+  end
+
+  def join_side
+    input_file = ENV['map_input_file']
+    @join.each do |j|
+      return j if input_file =~ j[:file_re]
+    end
+    raise "how do I handle input file '#{input_file}'?"
+  end
+
+  def process_stream(input = STDIN)
+    last_key = key = nil
+
+    j = join_side
+    cols = j[:columns]
+    filter = j[:filter]
+    side = j[:side]
+    sep = j[:sep]
+
+    input.each do |line|
+      fields = line.chomp.split(sep, -1)
+
+      c = []
+      cols.each_with_index do |col,i|
+        value = fields[col]
+        break if i == 0 and value.nil? # can't have nil key
+        c << value
+      end
+      next if c.empty?
+
+      next if filter and not filter.call(c)
+
+      o = "#{c[0]}#{@sep_out}#{side}#{@sep_out}"
+      o << c[1...c.length].join(@sep_out)
+      puts o
+    end
+  end
+end
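Without a Hadoop cluster, the class above can be exercised locally by faking the map_input_file environment variable and feeding process_stream a StringIO. A small sketch, with made-up paths, columns, and sample rows:

require 'stringio'
require 'streaming_join/join_mapper'

# Pretend Hadoop handed this task a split from a (hypothetical) users dataset.
ENV['map_input_file'] = '/data/users/part-00000'

jm = JoinMapper.new
jm.add_side(/users/, 0, 1)   # side 0: key from column 0, value from column 1
jm.add_side(/orders/, 2, 3)  # side 1: key from column 2, value from column 3

jm.process_stream(StringIO.new("42\talice\tTX\n43\tbob\tCA\n"))
# With the default tab separators this prints one record per input line:
#   42<TAB>0<TAB>alice
#   43<TAB>0<TAB>bob
# i.e. the join key, the side number, then the remaining selected columns.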
data/lib/streaming_join.rb
CHANGED
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
 prerelease: false
 segments:
 - 0
-- 1
+- 2
 - 0
-version: 0.1.0
+version: 0.2.0
 platform: ruby
 authors:
 - Frank Fejes
@@ -14,11 +14,11 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2011-08-
+date: 2011-08-22 00:00:00 -05:00
 default_executable:
 dependencies: []
 
-description: Classes to process various joins in Hadoop Streaming.
+description: Classes to process various joins in Hadoop Map/Reduce Streaming.
 email: frank@fejes.net
 executables: []
 
@@ -36,6 +36,7 @@ files:
 - lib/streaming_join/full_outer_join.rb
 - lib/streaming_join/cross_join.rb
 - lib/streaming_join/merge_rows.rb
+- lib/streaming_join/join_mapper.rb
 has_rdoc: true
 homepage: https://github.com/fsfiii/streaming_join
 licenses: []
@@ -67,6 +68,6 @@ rubyforge_project: streaming_join
 rubygems_version: 1.3.7
 signing_key:
 specification_version: 3
-summary: Classes to process joins in Hadoop Streaming
+summary: Classes to process joins in Hadoop Map/Reduce Streaming.
 test_files: []