streaming_join 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -1,2 +1,7 @@
1
- * 2011-08-19 - fsf
1
+ * 2011-08-22 - fsfiii
2
+ - new: add JoinMapper class, intended to be used as the mapper portion
3
+ - new: added examples/job_full which runs an entire job with both map and
4
+ reduce sides using the framework
5
+
6
+ * 2011-08-19 - fsfiii
2
7
  - initial import
@@ -0,0 +1,71 @@
1
# Mapper portion of a streaming join job.
#
# Each input line is split into fields, the configured columns for the
# current input file are extracted, and a record of the form
#
#   <key><sep_out><side index><sep_out><remaining columns joined by sep_out>
#
# is written to standard output.  The "side" is chosen by matching the
# path in the +map_input_file+ environment variable against the regular
# expressions registered with #add_side.
class JoinMapper
  # A separator given as a numeric character code, optionally prefixed
  # with "\" or "\u" (for example "0001", "\0001" or "\u0001").  The
  # digits are interpreted as hexadecimal by #decode_separator.
  SEPARATOR_CODE_RE = /\A(?:\\u?)?(\d+)\Z/

  # Reads the field separators from the environment; both default to TAB.
  #
  # We use our own input field separator variable
  # (+streaming_join_input_field_separator+) since the stock stream
  # variable can't handle control characters.  The output separator comes
  # from +stream_map_output_field_separator+.
  def initialize
    @sep_in  = decode_separator(ENV['streaming_join_input_field_separator'])
    @sep_out = decode_separator(ENV['stream_map_output_field_separator'])

    @join = []
  end

  # Writes a "reporter:counter" line for group "join" and counter +detail+
  # (increment of 1) to standard error.
  def report(detail)
    STDERR.puts "reporter:counter:join,#{detail},1"
  end

  # Registers one side of the join.
  #
  # file_re - pattern matched against the input file path (see #join_side).
  # columns - field indexes to emit; the first one is the join key.
  # filter  - optional block called with the array of extracted column
  #           values; the line is dropped when it returns false or nil.
  #
  # The side number is the registration order, starting at 0.
  # Returns the hash describing the newly added side.
  def add_side(file_re, *columns, &filter)
    side = {
      file_re: file_re,
      columns: columns,
      filter: filter,
      sep: @sep_in,
      side: @join.size
    }
    @join << side
    side
  end

  # Merges +opts+ into every registered side whose :file_re equals
  # +file_re+ (for example to override :sep for one input).
  # Returns the list of registered sides.
  def add_opts(file_re, opts)
    @join.each do |side|
      side.merge!(opts) if side[:file_re] == file_re
    end
  end

  # Returns the first registered side whose pattern matches the path in
  # the +map_input_file+ environment variable.
  #
  # Raises RuntimeError when no registered side matches.
  def join_side
    input_file = ENV['map_input_file']
    side = @join.find { |j| input_file =~ j[:file_re] }
    raise "how do I handle input file '#{input_file}'?" unless side
    side
  end

  # Reads lines from +input+ (STDIN by default) and prints one join record
  # per accepted line.
  #
  # Lines are skipped when the side has no columns, when the key column is
  # missing from the line, or when the side's filter rejects the values.
  def process_stream(input = STDIN)
    j = join_side
    cols = j[:columns]
    filter = j[:filter]
    side = j[:side]
    sep = j[:sep]

    input.each do |line|
      # limit -1 keeps trailing empty fields so column indexes stay stable
      fields = line.chomp.split(sep, -1)

      # can't have a nil key (and nothing to emit without columns)
      next if cols.empty? || fields[cols.first].nil?

      values = cols.map { |col| fields[col] }
      next if filter && !filter.call(values)

      # the separator after the side number is always written, even when
      # there are no further columns
      rest = values.drop(1).join(@sep_out)
      puts "#{values.first}#{@sep_out}#{side}#{@sep_out}#{rest}"
    end
  end

  private

  # Returns the separator to use for +raw+: TAB when +raw+ is nil, the
  # character whose hexadecimal code is given when +raw+ looks like a
  # numeric code (see SEPARATOR_CODE_RE), otherwise +raw+ unchanged.
  def decode_separator(raw)
    sep = raw || "\t"
    sep =~ SEPARATOR_CODE_RE ? Regexp.last_match(1).hex.chr : sep
  end
end
@@ -4,3 +4,4 @@ require 'streaming_join/right_outer_join.rb'
4
4
  require 'streaming_join/full_outer_join.rb'
5
5
  require 'streaming_join/cross_join.rb'
6
6
  require 'streaming_join/merge_rows.rb'
7
+ require 'streaming_join/join_mapper.rb'
metadata CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
4
4
  prerelease: false
5
5
  segments:
6
6
  - 0
7
- - 1
7
+ - 2
8
8
  - 0
9
- version: 0.1.0
9
+ version: 0.2.0
10
10
  platform: ruby
11
11
  authors:
12
12
  - Frank Fejes
@@ -14,11 +14,11 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-08-19 00:00:00 -05:00
17
+ date: 2011-08-22 00:00:00 -05:00
18
18
  default_executable:
19
19
  dependencies: []
20
20
 
21
- description: Classes to process various joins in Hadoop Streaming.
21
+ description: Classes to process various joins in Hadoop Map/Reduce Streaming.
22
22
  email: frank@fejes.net
23
23
  executables: []
24
24
 
@@ -36,6 +36,7 @@ files:
36
36
  - lib/streaming_join/full_outer_join.rb
37
37
  - lib/streaming_join/cross_join.rb
38
38
  - lib/streaming_join/merge_rows.rb
39
+ - lib/streaming_join/join_mapper.rb
39
40
  has_rdoc: true
40
41
  homepage: https://github.com/fsfiii/streaming_join
41
42
  licenses: []
@@ -67,6 +68,6 @@ rubyforge_project: streaming_join
67
68
  rubygems_version: 1.3.7
68
69
  signing_key:
69
70
  specification_version: 3
70
- summary: Classes to process joins in Hadoop Streaming
71
+ summary: Classes to process joins in Hadoop Map/Reduce Streaming.
71
72
  test_files: []
72
73