streaming_join 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +6 -1
 - data/lib/streaming_join/join_mapper.rb +71 -0
 - data/lib/streaming_join.rb +1 -0
 - metadata +6 -5
 
    
        data/CHANGELOG
    CHANGED
    
    | 
         @@ -1,2 +1,7 @@ 
     | 
|
| 
       1 
     | 
    
         
            -
            * 2011-08- 
     | 
| 
      
 1 
     | 
    
         
            +
            * 2011-08-22 - fsfiii
         
     | 
| 
      
 2 
     | 
    
         
            +
            - new: add JoinMapper class, intended to be used as the mapper portion
         
     | 
| 
      
 3 
     | 
    
         
            +
            - new: added examples/job_full which runs an entire job with both map and
         
     | 
| 
      
 4 
     | 
    
         
            +
                   reduce sides using the framework
         
     | 
| 
      
 5 
     | 
    
         
            +
             
     | 
| 
      
 6 
     | 
    
         
            +
            * 2011-08-19 - fsfiii
         
     | 
| 
       2 
7 
     | 
    
         
             
            - initial import
         
     | 
| 
         @@ -0,0 +1,71 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # Mapper-side helper for streaming joins.
            #
            # Each input "side" of the join is registered with a filename pattern
            # and the column indexes to extract. #process_stream then works out
            # which side the current input file belongs to (from the
            # `map_input_file` environment variable) and prints one record per
            # input line in the form:
            #
            #   <key><sep_out><side index><sep_out><remaining columns...>
            class JoinMapper
              # Reads the input and output field separators from the environment.
              #
              # * `streaming_join_input_field_separator`  - input separator,
              #   defaults to a tab.
              # * `stream_map_output_field_separator`     - output separator,
              #   defaults to a tab.
              #
              # Either value may be given as a run of digits (optionally prefixed
              # with `\` or `\u`), which is read as a hexadecimal character code.
              def initialize
                # use our own input field separator variable since the stock
                # stream variable can't handle control characters
                @sep_in  = decode_separator(
                  ENV['streaming_join_input_field_separator'] || "\t"
                )
                # Decoded from its OWN value. Previously this was guarded by a
                # match on @sep_in, so a numeric output separator was never
                # decoded (and could even be clobbered by the input's capture).
                @sep_out = decode_separator(
                  ENV['stream_map_output_field_separator'] || "\t"
                )

                @join = []
              end

              # Writes a `reporter:counter:join,<detail>,1` line to STDERR
              # (presumably picked up by the streaming framework as a counter
              # increment of 1 in the "join" group).
              #
              # detail - counter name to interpolate into the line.
              def report(detail)
                STDERR.puts "reporter:counter:join,#{detail},1"
              end

              # Registers one side of the join.
              #
              # file_re - pattern matched (with =~) against the input file name.
              # columns - column indexes to extract; the first one is the join key.
              # filter  - optional block called with the extracted values; the
              #           record is dropped when it returns a falsy value.
              #
              # The side index is its registration order (0 for the first call).
              # Returns the hash describing the side.
              def add_side(file_re, *columns, &filter)
                h = {
                  file_re: file_re,
                  columns: columns,
                  filter:  filter,
                  sep:     @sep_in,
                  side:    @join.size
                }
                @join << h
                h
              end

              # Merges +opts+ into every registered side whose :file_re equals
              # +file_re+ (e.g. to override :sep for one side).
              #
              # Returns the array of registered sides.
              def add_opts(file_re, opts)
                @join.each do |j|
                  next if j[:file_re] != file_re
                  j.merge! opts
                end
              end

              # Returns the first registered side whose :file_re matches the
              # `map_input_file` environment variable.
              #
              # Raises RuntimeError when no side matches (including when the
              # variable is unset).
              def join_side
                input_file = ENV['map_input_file']
                match = @join.find { |j| input_file =~ j[:file_re] }
                raise "how do I handle input file '#{input_file}'?" unless match
                match
              end

              # Reads lines from +input+, extracts the configured columns for the
              # current side and prints one joined-format record per kept line.
              #
              # input - any object whose #each yields lines (defaults to STDIN).
              #
              # Lines whose key column is missing, or that the side's filter
              # rejects, are skipped.
              def process_stream(input = STDIN)
                j = join_side
                cols   = j[:columns]
                filter = j[:filter]
                side   = j[:side]
                sep    = j[:sep]

                input.each do |line|
                  # limit -1 keeps trailing empty fields
                  fields = line.chomp.split(sep, -1)

                  c = []
                  cols.each_with_index do |col, i|
                    value = fields[col]
                    break if i == 0 && value.nil? # can't have nil key
                    c << value
                  end
                  next if c.empty?

                  next if filter && !filter.call(c)

                  # key, side index, then the remaining columns; the separator
                  # after the side index is always emitted, even with no
                  # remaining columns.
                  o = "#{c[0]}#{@sep_out}#{side}#{@sep_out}"
                  o << c[1...c.length].join(@sep_out)
                  puts o
                end
              end

              private

              # Turns a separator spec into the actual separator string.
              #
              # A value made only of digits, optionally prefixed with `\` or
              # `\u`, is read as a hex character code and converted with
              # Integer#chr (which raises RangeError for codes it cannot
              # represent). Any other value is returned unchanged.
              def decode_separator(raw)
                m = /\A(?:\\u?)?(\d+)\Z/.match(raw)
                m ? m[1].hex.chr : raw
              end
            end
         
     | 
    
        data/lib/streaming_join.rb
    CHANGED
    
    
    
        metadata
    CHANGED
    
    | 
         @@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version 
     | 
|
| 
       4 
4 
     | 
    
         
             
              prerelease: false
         
     | 
| 
       5 
5 
     | 
    
         
             
              segments: 
         
     | 
| 
       6 
6 
     | 
    
         
             
              - 0
         
     | 
| 
       7 
     | 
    
         
            -
              -  
     | 
| 
      
 7 
     | 
    
         
            +
              - 2
         
     | 
| 
       8 
8 
     | 
    
         
             
              - 0
         
     | 
| 
       9 
     | 
    
         
            -
              version: 0. 
     | 
| 
      
 9 
     | 
    
         
            +
              version: 0.2.0
         
     | 
| 
       10 
10 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       11 
11 
     | 
    
         
             
            authors: 
         
     | 
| 
       12 
12 
     | 
    
         
             
            - Frank Fejes
         
     | 
| 
         @@ -14,11 +14,11 @@ autorequire: 
     | 
|
| 
       14 
14 
     | 
    
         
             
            bindir: bin
         
     | 
| 
       15 
15 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       16 
16 
     | 
    
         | 
| 
       17 
     | 
    
         
            -
            date: 2011-08- 
     | 
| 
      
 17 
     | 
    
         
            +
            date: 2011-08-22 00:00:00 -05:00
         
     | 
| 
       18 
18 
     | 
    
         
             
            default_executable: 
         
     | 
| 
       19 
19 
     | 
    
         
             
            dependencies: []
         
     | 
| 
       20 
20 
     | 
    
         | 
| 
       21 
     | 
    
         
            -
            description: Classes to process various joins in Hadoop Streaming.
         
     | 
| 
      
 21 
     | 
    
         
            +
            description: Classes to process various joins in Hadoop Map/Reduce Streaming.
         
     | 
| 
       22 
22 
     | 
    
         
             
            email: frank@fejes.net
         
     | 
| 
       23 
23 
     | 
    
         
             
            executables: []
         
     | 
| 
       24 
24 
     | 
    
         | 
| 
         @@ -36,6 +36,7 @@ files: 
     | 
|
| 
       36 
36 
     | 
    
         
             
            - lib/streaming_join/full_outer_join.rb
         
     | 
| 
       37 
37 
     | 
    
         
             
            - lib/streaming_join/cross_join.rb
         
     | 
| 
       38 
38 
     | 
    
         
             
            - lib/streaming_join/merge_rows.rb
         
     | 
| 
      
 39 
     | 
    
         
            +
            - lib/streaming_join/join_mapper.rb
         
     | 
| 
       39 
40 
     | 
    
         
             
            has_rdoc: true
         
     | 
| 
       40 
41 
     | 
    
         
             
            homepage: https://github.com/fsfiii/streaming_join
         
     | 
| 
       41 
42 
     | 
    
         
             
            licenses: []
         
     | 
| 
         @@ -67,6 +68,6 @@ rubyforge_project: streaming_join 
     | 
|
| 
       67 
68 
     | 
    
         
             
            rubygems_version: 1.3.7
         
     | 
| 
       68 
69 
     | 
    
         
             
            signing_key: 
         
     | 
| 
       69 
70 
     | 
    
         
             
            specification_version: 3
         
     | 
| 
       70 
     | 
    
         
            -
            summary: Classes to process joins in Hadoop Streaming
         
     | 
| 
      
 71 
     | 
    
         
            +
            summary: Classes to process joins in Hadoop Map/Reduce Streaming.
         
     | 
| 
       71 
72 
     | 
    
         
             
            test_files: []
         
     | 
| 
       72 
73 
     | 
    
         |