streaming_join 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,2 +1,7 @@
1
- * 2011-08-19 - fsf
1
+ * 2011-08-22 - fsfiii
2
+ - new: add JoinMapper class, intended to be used as the mapper portion
3
+ - new: added examples/job_full which runs an entire job with both map and
4
+ reduce sides using the framework
5
+
6
+ * 2011-08-19 - fsfiii
2
7
  - initial import
@@ -0,0 +1,71 @@
1
# Mapper half of a streaming join.
#
# Each input file is matched against the "sides" registered with #add_side.
# For every input line the configured columns are extracted and written to
# standard output as:
#
#   <key><sep_out><side index><sep_out><remaining columns joined by sep_out>
#
# Environment variables read:
#   streaming_join_input_field_separator - input field separator (default tab)
#   stream_map_output_field_separator    - output field separator (default tab)
#   map_input_file                       - name of the file being processed
class JoinMapper
  # A separator given purely as digits, optionally prefixed with "\" or "\u"
  # (for example "\u0001" or "0001"), is read as a hexadecimal character
  # code, so control characters can be passed through the environment.
  SEPARATOR_CODE = /\A(?:\\u?)?(\d+)\Z/

  def initialize
    # use our own input field separator variable since the stock
    # stream variable can't handle control characters
    @sep_in  = decode_separator(ENV['streaming_join_input_field_separator'] || "\t")
    # The output separator is decoded from its OWN value; previously the
    # check was made against @sep_in, so the output separator was not
    # decoded correctly.
    @sep_out = decode_separator(ENV['stream_map_output_field_separator'] || "\t")

    @join = []
  end

  # Emits a counter update line on standard error, in the group "join",
  # incrementing the counter named +detail+ by 1.
  def report(detail)
    STDERR.puts "reporter:counter:join,#{detail},1"
  end

  # Registers one side of the join.
  #
  # file_re - Regexp matched against the current input file name.
  # columns - zero-based field indexes to emit; the first one is the key.
  # filter  - optional block called with the array of extracted values;
  #           a falsy result drops the row.
  #
  # Returns the Hash describing the side. Its :side value is the order of
  # registration (0 for the first side, 1 for the second, ...).
  def add_side(file_re, *columns, &filter)
    side = {
      file_re: file_re,
      columns: columns,
      filter:  filter,
      sep:     @sep_in,
      side:    @join.size
    }
    @join << side
    side
  end

  # Merges +opts+ into every registered side whose :file_re equals +file_re+
  # (for example to override :sep for a single side).
  def add_opts(file_re, opts)
    @join.each do |side|
      side.merge!(opts) if side[:file_re] == file_re
    end
  end

  # Returns the first registered side whose :file_re matches the file named
  # by ENV['map_input_file'].
  #
  # Raises RuntimeError when no side matches.
  def join_side
    input_file = ENV['map_input_file']
    @join.each do |side|
      return side if input_file =~ side[:file_re]
    end
    raise "how do I handle input file '#{input_file}'?"
  end

  # Reads lines from +input+ (default STDIN), extracts the columns of the
  # side selected by #join_side and prints one output row per accepted line.
  #
  # Lines are skipped when the side has no columns, when the key column is
  # missing from the line, or when the side's filter returns a falsy value.
  def process_stream(input = STDIN)
    side_cfg = join_side
    cols     = side_cfg[:columns]
    filter   = side_cfg[:filter]
    side     = side_cfg[:side]
    sep      = side_cfg[:sep]

    input.each do |line|
      # limit of -1 keeps trailing empty fields
      fields = line.chomp.split(sep, -1)

      next if cols.empty?
      next if fields[cols.first].nil? # can't have nil key

      values = cols.map { |col| fields[col] }
      next if filter && !filter.call(values)

      # missing non-key columns are nil and are emitted as empty strings
      rest = values[1..-1].join(@sep_out)
      puts "#{values[0]}#{@sep_out}#{side}#{@sep_out}#{rest}"
    end
  end

  private

  # Converts a separator written as a hexadecimal character code into the
  # corresponding single character; any other value is returned unchanged.
  def decode_separator(raw)
    code = raw[SEPARATOR_CODE, 1]
    code ? code.hex.chr : raw
  end
end
@@ -4,3 +4,4 @@ require 'streaming_join/right_outer_join.rb'
4
4
  require 'streaming_join/full_outer_join.rb'
5
5
  require 'streaming_join/cross_join.rb'
6
6
  require 'streaming_join/merge_rows.rb'
7
+ require 'streaming_join/join_mapper.rb'
metadata CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
4
4
  prerelease: false
5
5
  segments:
6
6
  - 0
7
- - 1
7
+ - 2
8
8
  - 0
9
- version: 0.1.0
9
+ version: 0.2.0
10
10
  platform: ruby
11
11
  authors:
12
12
  - Frank Fejes
@@ -14,11 +14,11 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-08-19 00:00:00 -05:00
17
+ date: 2011-08-22 00:00:00 -05:00
18
18
  default_executable:
19
19
  dependencies: []
20
20
 
21
- description: Classes to process various joins in Hadoop Streaming.
21
+ description: Classes to process various joins in Hadoop Map/Reduce Streaming.
22
22
  email: frank@fejes.net
23
23
  executables: []
24
24
 
@@ -36,6 +36,7 @@ files:
36
36
  - lib/streaming_join/full_outer_join.rb
37
37
  - lib/streaming_join/cross_join.rb
38
38
  - lib/streaming_join/merge_rows.rb
39
+ - lib/streaming_join/join_mapper.rb
39
40
  has_rdoc: true
40
41
  homepage: https://github.com/fsfiii/streaming_join
41
42
  licenses: []
@@ -67,6 +68,6 @@ rubyforge_project: streaming_join
67
68
  rubygems_version: 1.3.7
68
69
  signing_key:
69
70
  specification_version: 3
70
- summary: Classes to process joins in Hadoop Streaming
71
+ summary: Classes to process joins in Hadoop Map/Reduce Streaming.
71
72
  test_files: []
72
73