hadoop-rubydsl 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +53 -0
- data/Rakefile +18 -0
- data/TODO +2 -0
- data/VERSION +1 -0
- data/bin/hadoop +276 -0
- data/bin/hadoop-ruby.sh +30 -0
- data/conf/hadoop-site.xml +19 -0
- data/examples/apachelog-v2-2.rb +18 -0
- data/examples/apachelog-v2.rb +25 -0
- data/examples/apachelog.rb +15 -0
- data/examples/hive_like_test.rb +14 -0
- data/examples/word_count_test.rb +7 -0
- data/hadoop-rubydsl.gemspec +79 -0
- data/lib/core.rb +108 -0
- data/lib/hive_like.rb +122 -0
- data/lib/init.rb +60 -0
- data/lib/java/.gitignore +1 -0
- data/lib/java/hadoop-ruby.jar +0 -0
- data/lib/log_analysis.rb +165 -0
- data/lib/mapred_factory.rb +43 -0
- data/lib/util.rb +11 -0
- data/lib/word_count.rb +76 -0
- data/spec/core_spec.rb +73 -0
- data/spec/example_spec.rb +82 -0
- data/spec/hive_like_spec.rb +58 -0
- data/spec/init_spec.rb +56 -0
- data/spec/log_analysis_spec.rb +119 -0
- data/spec/mapred_factory_spec.rb +42 -0
- data/spec/spec_helper.rb +11 -0
- data/spec/util_spec.rb +15 -0
- data/spec/word_count_spec.rb +89 -0
- metadata +100 -0
    
        data/lib/hive_like.rb
    ADDED
    
    | @@ -0,0 +1,122 @@ | |
| 1 | 
            +
            require 'core'
         | 
| 2 | 
            +
            require 'enumerator'
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            module HadoopDsl::HiveLike
         | 
| 5 | 
            +
              include HadoopDsl
         | 
| 6 | 
            +
              
         | 
| 7 | 
            +
              AVAILABLE_METHODS = [:select, :create_table, :table]
         | 
| 8 | 
            +
             | 
| 9 | 
            +
              # common
         | 
| 10 | 
            +
              # Script pre-processing shared by the HiveLike controllers: rewrites
              # HiveQL-style statements into Ruby method-call source text.
              module HiveLikeMapRed
                # Rewrites +body+ line by line:
                #
                # * lines starting with '#' are dropped;
                # * a line shaped like "<word> <rest>;" becomes '<word>("a", "b", ...)'
                #   followed by a newline;
                # * any other line is copied through with "\n" appended.
                #
                # body:: a String (walked line by line) or any object whose #each
                #        yields lines, e.g. an Array of Strings.
                #
                # Returns the rewritten script as a new String.
                def pre_process(body)
                  processed = ''.dup
                  # String#each only exists on Ruby 1.8; use each_line whenever the
                  # object offers it so plain Strings work on every Ruby version.
                  iterator = body.respond_to?(:each_line) ? :each_line : :each
                  body.send(iterator) do |line|
                    next if line =~ /^#/
                    if line =~ /^(\w*)\s+(.*);$/
                      method = $1
                      args = sprit_and_marge_args($2)
                      processed << "#{method}(#{args})\n"
                    else
                      processed << line + "\n"
                    end
                  end
                  processed
                end

                # Turns the raw argument text of a statement into a Ruby argument
                # list: parentheses are treated as whitespace, the text is split on
                # whitespace, commas/quotes are stripped from every token, and each
                # token is wrapped in double quotes.
                #
                # Returns the tokens joined with ", ".
                def sprit_and_marge_args(raw)
                  raw.gsub(/[\(\)]/, ' ').split.map do |s|
                    stripped = s.gsub(/[\s,"']/, '')
                    %Q!"#{stripped}"!
                  end.join(", ")
                end
              end
         | 
| 33 | 
            +
             | 
| 34 | 
            +
              # controller
         | 
| 35 | 
            +
              # Setup controller for HiveLike scripts.
              # NOTE(review): @conf, @from and @to are presumably consumed by
              # BaseSetup, which is not visible here -- confirm.
              class HiveLikeSetup < BaseSetup
                include HiveLikeMapRed

                # Records +inputs+ as the source path and derives the destination by
                # replacing the trailing basename of +inputs+ with 'outputs'.
                # +table+ is accepted but not used here.
                def load_data(inputs, table)
                  @from = inputs
                  # The basename is data, not a pattern: escape it so names that
                  # contain regexp metacharacters (".", "+", "[" ...) still match
                  # themselves and nothing else.
                  basename = Regexp.escape(File.basename(inputs))
                  @to = inputs.gsub(/#{basename}$/, 'outputs')
                end

                # Declares Text as both the output key and output value class.
                def output_format
                  @conf.output_key_class = Text
                  @conf.output_value_class = Text
                end

                # Intentional no-op: a script's select statement is also evaluated
                # during setup, so the method has to exist even though setup has
                # nothing to do for it.
                def select(*args) end
              end
         | 
| 51 | 
            +
             | 
| 52 | 
            +
              # Mapper controller for HiveLike scripts. Wraps a HiveLikeMapperModel
              # built from the incoming key/value pair.
              class HiveLikeMapper < BaseMapper
                include HiveLikeMapRed

                # Forward every DSL verb listed in AVAILABLE_METHODS to @model.
                def_delegators :@model, *AVAILABLE_METHODS

                def initialize(script, key, value)
                  model = HiveLikeMapperModel.new(key, value)
                  super(script, model)
                end
              end
         | 
| 62 | 
            +
             | 
| 63 | 
            +
              # Reducer controller for HiveLike scripts. Wraps a HiveLikeReducerModel
              # built from the key and its grouped values.
              class HiveLikeReducer < BaseReducer
                include HiveLikeMapRed

                # Forward every DSL verb listed in AVAILABLE_METHODS to @model.
                def_delegators :@model, *AVAILABLE_METHODS

                def initialize(script, key, values)
                  model = HiveLikeReducerModel.new(key, values)
                  super(script, model)
                end
              end
         | 
| 73 | 
            +
             | 
| 74 | 
            +
              # model
         | 
| 75 | 
            +
              # Map-side model for HiveLike scripts: remembers the declared table and
              # emits the selected columns of the current input record.
              # NOTE(review): @value and @controller are presumably provided by
              # BaseMapperModel, which is not visible here -- confirm.
              class HiveLikeMapperModel < BaseMapperModel
                # The Table declared by the most recent create_table call, if any.
                attr_reader :table

                def initialize(key, value)
                  super(key, value)
                end

                # emitters

                # Declares the table layout. +column_and_type+ is a flat list of
                # alternating column names and types; only the names (even
                # positions) are kept, in order.
                def create_table(name, *column_and_type)
                  @table = Table.new(name)
                  column_and_type.each_with_index do |column, index|
                    next if index % 2 != 0 # odd positions hold the column type
                    @table.columns << column
                  end
                end

                # Handles "select <columns...> from <name>". Everything before the
                # 'from' token is a column name; the matching fields of @value
                # (split on commas/whitespace) are joined with ", " and emitted
                # under the token that follows 'from'.
                #
                # Does nothing when no 'from' token is present.
                # Raises ArgumentError when a selected column was never declared.
                def select(*args)
                  from_index = args.index('from')
                  return unless from_index

                  fields = nil
                  values = args[0...from_index].map do |column|
                    position = @table.columns.index(column)
                    raise ArgumentError, "unknown column: #{column}" unless position
                    # Split the record once, and only when a column is requested.
                    fields ||= @value.split(/[,\s]+/)
                    fields[position]
                  end
                  @controller.emit(args[from_index + 1] => values.join(", "))
                end

                # Name plus ordered column names of a declared table.
                class Table
                  attr_reader :name, :columns

                  def initialize(name)
                    @name = name
                    @columns = []
                  end

                  # Column name stored at +index+.
                  def column(index) @columns[index] end
                end
              end
         | 
| 113 | 
            +
             | 
| 114 | 
            +
              # Reduce-side model for HiveLike scripts.
              class HiveLikeReducerModel < BaseReducerModel
                def initialize(key, values)
                  super
                end

                # emitters

                # Ignores its arguments and hands off to #identity.
                # NOTE(review): identity is presumably inherited from
                # BaseReducerModel (not visible here) -- confirm what it emits.
                def select(*args)
                  identity
                end
              end
         | 
| 122 | 
            +
            end
         | 
    
        data/lib/init.rb
    ADDED
    
    | @@ -0,0 +1,60 @@ | |
| 1 | 
            +
            require 'core'
         | 
| 2 | 
            +
            require 'java'
         | 
| 3 | 
            +
            require 'mapred_factory'
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            import 'org.apache.hadoop.io.IntWritable'
         | 
| 6 | 
            +
            import 'org.apache.hadoop.io.Text'
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            include HadoopDsl
         | 
| 9 | 
            +
             | 
| 10 | 
            +
            # Hadoop IO types
         | 
| 11 | 
            +
            HadoopDsl::Text = Text
         | 
| 12 | 
            +
            HadoopDsl::IntWritable = IntWritable
         | 
| 13 | 
            +
             | 
| 14 | 
            +
            # Map entry point. Converts the key and value with #to_string, runs the
            # mapper controller MapperFactory builds for +script+, then writes what
            # it emitted to +output+. +reporter+ is not used.
            def map(key, value, output, reporter, script)
              ruby_key = key.to_string
              ruby_value = value.to_string
              controller = MapperFactory.create(script, ruby_key, ruby_value)
              controller.run
              write(output, controller)
            end
         | 
| 20 | 
            +
             | 
| 21 | 
            +
            # Reduce entry point. Converts every incoming value with to_ruby, runs
            # the reducer controller ReducerFactory builds for +script+, then writes
            # what it emitted to +output+. +reporter+ is not used.
            def reduce(key, values, output, reporter, script)
              plain_values = values.map { |writable| to_ruby(writable) }
              controller = ReducerFactory.create(script, key.to_string, plain_values)
              controller.run
              write(output, controller)
            end
         | 
| 28 | 
            +
             | 
| 29 | 
            +
            # Setup entry point. Runs the setup controller SetupFactory builds for
            # +script+ and returns its paths converted with #to_java.
            def setup(conf, script)
              job_setup = SetupFactory.create(script, conf)
              job_setup.run
              job_setup.paths.to_java
            end
         | 
| 35 | 
            +
             | 
| 36 | 
            +
            private
         | 
| 37 | 
            +
             | 
| 38 | 
            +
            # Sends every key/value pair found in controller.emitted to +output+,
            # converting both sides with to_hadoop first. Each element of
            # controller.emitted is iterated as key/value pairs.
            def write(output, controller)
              controller.emitted.each do |pairs|
                pairs.each { |key, value| output.collect(to_hadoop(key), to_hadoop(value)) }
              end
            end
         | 
| 45 | 
            +
             | 
| 46 | 
            +
            # Unwraps a Hadoop writable: IntWritable via #get, Text via #to_string.
            # Raises RuntimeError for any other class.
            def to_ruby(value)
              return value.get if value.is_a?(IntWritable)
              return value.to_string if value.is_a?(Text)

              raise "no match class: #{value.class}"
            end
         | 
| 53 | 
            +
             | 
| 54 | 
            +
            # Wraps a Ruby value in a Hadoop writable: Integer -> IntWritable,
            # String -> Text. Raises RuntimeError for any other class.
            def to_hadoop(value)
              if value.is_a?(Integer)
                IntWritable.new(value)
              elsif value.is_a?(String)
                text = Text.new
                text.set(value)
                text
              else
                raise "no match class: #{value.class}"
              end
            end
         | 
    
        data/lib/java/.gitignore
    ADDED
    
    | @@ -0,0 +1 @@ | |
| 1 | 
            +
            jruby-complete-*.jar
         | 
| Binary file | 
    
        data/lib/log_analysis.rb
    ADDED
    
    | @@ -0,0 +1,165 @@ | |
| 1 | 
            +
            require 'core'
         | 
| 2 | 
            +
            require 'enumerator'
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            module HadoopDsl::LogAnalysis
         | 
| 5 | 
            +
              include HadoopDsl
         | 
| 6 | 
            +
              
         | 
| 7 | 
            +
              KEY_SEP = "\t"
         | 
| 8 | 
            +
              PREFIX = 'col'
         | 
| 9 | 
            +
              PASS = nil
         | 
| 10 | 
            +
              AVAILABLE_METHODS = [:separate, :pattern, :column_name, :column, :topic, :value, :count_uniq, :sum]
         | 
| 11 | 
            +
             | 
| 12 | 
            +
              # common
         | 
| 13 | 
            +
              # Structural DSL keywords shared by the LogAnalysis controllers. Both
              # simply run the block they are given and return its result.
              module LogAnalysisMapRed
                # entry point; +description+ is accepted but not used
                def data(description = '', &block)
                  yield
                end

                def each_line(&block)
                  yield
                end
              end
         | 
| 19 | 
            +
             | 
| 20 | 
            +
              # controller
         | 
| 21 | 
            +
              # Setup controller for LogAnalysis scripts; adds only the shared
              # data/each_line keywords on top of BaseSetup.
              class LogAnalysisSetup < BaseSetup
                include LogAnalysisMapRed

                def initialize(script, conf)
                  super
                end
              end
         | 
| 28 | 
            +
             | 
| 29 | 
            +
              # Mapper controller for LogAnalysis scripts. Wraps a
              # LogAnalysisMapperModel built from the incoming key/value pair.
              class LogAnalysisMapper < BaseMapper
                include LogAnalysisMapRed

                # Forward every DSL verb listed in AVAILABLE_METHODS to @model.
                def_delegators :@model, *AVAILABLE_METHODS

                def initialize(script, key, value)
                  model = LogAnalysisMapperModel.new(key, value)
                  super(script, model)
                end
              end
         | 
| 39 | 
            +
             | 
| 40 | 
            +
              # Reducer controller for LogAnalysis scripts. Wraps a
              # LogAnalysisReducerModel built from the key and its grouped values.
              class LogAnalysisReducer < BaseReducer
                include LogAnalysisMapRed

                # Forward every DSL verb listed in AVAILABLE_METHODS to @model.
                def_delegators :@model, *AVAILABLE_METHODS

                def initialize(script, key, values)
                  model = LogAnalysisReducerModel.new(key, values)
                  super(script, model)
                end
              end
         | 
| 50 | 
            +
             | 
| 51 | 
            +
              # model
         | 
| 52 | 
            +
              # Map-side model for LogAnalysis scripts: breaks the current record into
              # columns and emits per-topic keys.
              # NOTE(review): @value and @controller are presumably provided by
              # BaseMapperModel, which is not visible here -- confirm.
              class LogAnalysisMapperModel < BaseMapperModel
                def initialize(key, value)
                  super(key, value)
                  @columns = ColumnArray.new
                  @topics = []
                end

                # The current ColumnArray; index it by position or by name.
                def column; @columns end

                # Opens a topic, makes it the current one, runs the optional block
                # and returns the topic. options[:label] overrides the label that
                # would otherwise be derived from +desc+.
                def topic(desc, options = {}, &block)
                  @topics << @current_topic = Topic.new(desc, options[:label])
                  yield if block_given?
                  @current_topic
                end

                # Splits @value on +sep+ and stores the parts as column values.
                def separate(sep)
                  parts = @value.split(sep)
                  create_or_replace_columns_with(parts) {|column, value| column.value = value}
                end

                # Matches @value against +re+ and stores the captures as column
                # values. Columns are left untouched when there is no match.
                def pattern(re)
                  if @value =~ re
                    md = Regexp.last_match
                    create_or_replace_columns_with(md.captures) {|column, value| column.value = value}
                  end
                end

                # Assigns names to the columns by position. String names are
                # converted to Symbols.
                def column_name(*names)
                  sym_names = names.map {|name| name.is_a?(String) ? name.to_sym : name }
                  create_or_replace_columns_with(sym_names) {|column, name| column.name = name}
                end

                # Rebuilds @columns with exactly one column per element of +array+:
                # the existing column at that position is reused (otherwise a new one
                # is created) and yielded together with the element.
                def create_or_replace_columns_with(array, &block)
                  columns = array.enum_for(:each_with_index).map do |p, i|
                    c = @columns[i] || Column.new(i)
                    yield c, p
                    c
                  end
                  @columns = ColumnArray.new(columns)
                end

                # emitters

                # Emits "<topic label><KEY_SEP><column value>" => 1.
                def count_uniq(column)
                  @controller.emit([@current_topic.label, KEY_SEP, column.value].join => 1)
                end

                # Emits "<topic label>" => the column value converted with to_i.
                def sum(column)
                  @controller.emit([@current_topic.label].join => column.value.to_i)
                end

                # Array of Column objects that can also be indexed by column name.
                class ColumnArray < Array
                  # Integer keys index by position, Symbol and String keys look the
                  # column up by name (nil when absent). Every other form -- ranges,
                  # [start, length] -- behaves like Array#[].
                  def [](key, *rest)
                    return super unless rest.empty?

                    case key
                    when Integer then at(key)
                    when Symbol then find {|c| c.name == key}
                    when String
                      name = key.to_sym
                      find {|c| c.name == name}
                    else super
                    end
                  end
                end

                # One field of the record: its position, its value, optionally a name.
                class Column
                  attr_reader :index
                  attr_accessor :value, :name

                  def initialize(index, value = nil)
                    @index, @value = index, value
                  end
                end

                # A named group of emitted keys.
                class Topic
                  def initialize(desc, label = nil)
                    @desc, @label = desc, label
                  end

                  # The explicit label, or the description with every whitespace
                  # character replaced by an underscore.
                  def label
                    @label || @desc.gsub(/\s/, '_')
                  end
                end
              end
         | 
| 132 | 
            +
             | 
| 133 | 
            +
              # Reduce-side model for LogAnalysis scripts: aggregates the values of a
              # key only inside the topic block that produced that key.
              # NOTE(review): aggregate is presumably inherited from BaseReducerModel
              # (not visible here) -- confirm what it emits.
              class LogAnalysisReducerModel < BaseReducerModel
                # Derives the topic of the incoming key: its label is the run of
                # word characters at the start of +key+.
                def initialize(key, values)
                  super(key, values)
                  if key =~ /(\w*)#{KEY_SEP}?(.*)/
                    @topic = Topic.new($1, values)
                  end
                end

                # Makes the topic named by options[:label] (or by +desc+ with
                # whitespace turned into underscores) current, runs the optional
                # block and returns the topic.
                def topic(desc, options = {}, &block)
                  @current_topic = Topic.new(options[:label] || desc.gsub(/\s/, '_'), nil)
                  yield if block_given?
                  @current_topic
                end

                # Aggregates only when the key belongs to the current topic.
                def count_uniq(column)
                  aggregate if @topic == @current_topic
                end

                # Aggregates only when the key belongs to the current topic.
                def sum(column)
                  aggregate if @topic == @current_topic
                end

                # Label plus the values that arrived with it.
                class Topic
                  attr_reader :label, :values

                  def initialize(label, values)
                    @label, @values = label, values
                  end

                  # Topics are equal when their labels are. Anything without a
                  # label (nil included, e.g. an emitter used outside a topic
                  # block) is simply not equal instead of raising NoMethodError.
                  def ==(rh)
                    rh.respond_to?(:label) && label == rh.label
                  end
                end
              end
         | 
| 165 | 
            +
            end
         | 
| @@ -0,0 +1,43 @@ | |
| 1 | 
            +
            require 'util'
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module HadoopDsl
         | 
| 4 | 
            +
              # Shared helpers for the factories that build DSL controllers.
              class MapRedFactory
                # Returns the name declared by the first "use '<Name>'" line of the
                # script, or nil when the script has no such line. Lines before the
                # declaration (comments, blanks) are skipped.
                # NOTE(review): read_file is defined outside this class; it is
                # assumed to return the script's text -- confirm.
                def self.dsl_name(script)
                  read_file(script).each_line do |line|
                    return $1 if line =~ /^use\s*'(\w*)'/
                  end
                  nil
                end

                # Requires the library whose file name is snake_case(dsl_name).
                def self.require_dsl_lib(dsl_name)
                  require snake_case(dsl_name)
                end
              end
         | 
| 16 | 
            +
             | 
| 17 | 
            +
              # Builds the mapper controller for the DSL a script declares.
              class MapperFactory < MapRedFactory
                # Loads the DSL library named by the script and instantiates
                # HadoopDsl::<Name>::<Name>Mapper with (script, key, value).
                def self.create(script, key, value)
                  dsl_name = self.dsl_name(script)
                  require_dsl_lib(dsl_name)
                  # Resolve the class through constant lookup rather than eval, so
                  # text taken from the script is never executed as code.
                  mapper_class = HadoopDsl.const_get(dsl_name).const_get("#{dsl_name}Mapper")
                  mapper_class.new(script, key, value)
                end
              end
         | 
| 25 | 
            +
             | 
| 26 | 
            +
              # Builds the reducer controller for the DSL a script declares.
              class ReducerFactory < MapRedFactory
                # Loads the DSL library named by the script and instantiates
                # HadoopDsl::<Name>::<Name>Reducer with (script, key, values).
                def self.create(script, key, values)
                  dsl_name = self.dsl_name(script)
                  require_dsl_lib(dsl_name)
                  # Resolve the class through constant lookup rather than eval, so
                  # text taken from the script is never executed as code.
                  reducer_class = HadoopDsl.const_get(dsl_name).const_get("#{dsl_name}Reducer")
                  reducer_class.new(script, key, values)
                end
              end
         | 
| 34 | 
            +
             | 
| 35 | 
            +
              # Builds the setup controller for the DSL a script declares.
              class SetupFactory < MapRedFactory
                # Loads the DSL library named by the script and instantiates
                # HadoopDsl::<Name>::<Name>Setup with (script, conf). Falls back to
                # HadoopDsl::BaseSetup when that class cannot be resolved.
                def self.create(script, conf)
                  dsl_name = self.dsl_name(script)
                  require_dsl_lib(dsl_name)
                  # Only a failed constant lookup triggers the fallback; errors
                  # raised by the setup's own constructor are no longer hidden.
                  setup_class = begin
                    HadoopDsl.const_get(dsl_name).const_get("#{dsl_name}Setup")
                  rescue NameError
                    HadoopDsl::BaseSetup
                  end
                  setup_class.new(script, conf)
                end
              end
         | 
| 43 | 
            +
            end
         | 
    
        data/lib/util.rb
    ADDED
    
    
    
        data/lib/word_count.rb
    ADDED
    
    | @@ -0,0 +1,76 @@ | |
| 1 | 
            +
            require 'core'
         | 
| 2 | 
            +
            require 'enumerator'
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            module HadoopDsl::WordCount
         | 
| 5 | 
            +
              include HadoopDsl
         | 
| 6 | 
            +
              
         | 
| 7 | 
            +
              AVAILABLE_METHODS = [:count_uniq, :total]
         | 
| 8 | 
            +
              TOTAL_PREFIX = "\t"
         | 
| 9 | 
            +
             | 
| 10 | 
            +
              # common
         | 
| 11 | 
            +
              # Structural DSL keyword shared by the WordCount controllers.
              module WordCountMapRed
                # entry point: runs the given block and returns its result;
                # +description+ is accepted but not used
                def data(description = '', &block)
                  yield
                end
              end
         | 
| 15 | 
            +
             | 
| 16 | 
            +
              # controller
         | 
| 17 | 
            +
              # Mapper controller for WordCount scripts. Wraps a WordCountMapperModel
              # built from the incoming key/value pair.
              class WordCountMapper < BaseMapper
                include WordCountMapRed

                # Forward every DSL verb listed in AVAILABLE_METHODS to @model.
                def_delegators :@model, *AVAILABLE_METHODS

                def initialize(script, key, value)
                  model = WordCountMapperModel.new(key, value)
                  super(script, model)
                end
              end
         | 
| 27 | 
            +
             | 
| 28 | 
            +
              # Reducer controller for WordCount scripts. Wraps a
              # WordCountReducerModel built from the key and its grouped values.
              class WordCountReducer < BaseReducer
                include WordCountMapRed

                # Forward every DSL verb listed in AVAILABLE_METHODS to @model.
                def_delegators :@model, *AVAILABLE_METHODS

                def initialize(script, key, values)
                  model = WordCountReducerModel.new(key, values)
                  super(script, model)
                end
              end
         | 
| 38 | 
            +
             | 
| 39 | 
            +
              # model
         | 
| 40 | 
            +
              # Map-side model for WordCount scripts.
              # NOTE(review): @value and @controller are presumably provided by
              # BaseMapperModel, which is not visible here -- confirm.
              class WordCountMapperModel < BaseMapperModel
                def initialize(key, value)
                  super(key, value)
                end

                # emitters

                # Emits word => 1 for every whitespace-separated word of @value.
                def count_uniq
                  @value.split.each {|word| @controller.emit(word => 1)}
                end

                # Emits one TOTAL_PREFIX-ed counter per requested type:
                # :bytes:: size in bytes of @value with all whitespace removed
                # :words:: number of whitespace-separated words
                # :lines:: 1
                # Any other type is ignored.
                def total(*types)
                  types.each do |type|
                    case type
                    when :bytes
                      # bytesize, not length: on Ruby 1.9+ String#length counts
                      # characters, which under-reports multi-byte text.
                      @controller.emit("#{TOTAL_PREFIX}total bytes" => @value.gsub(/\s/, '').bytesize)
                    when :words
                      @controller.emit("#{TOTAL_PREFIX}total words" => @value.split.size)
                    when :lines
                      @controller.emit("#{TOTAL_PREFIX}total lines" => 1)
                    end
                  end
                end
              end
         | 
| 63 | 
            +
             | 
| 64 | 
            +
              # Reducer-side model of the word-count DSL. A key that starts with
              # TOTAL_PREFIX is a running total; every other key is a word.
              class WordCountReducerModel < BaseReducerModel
                def initialize(key, values)
                  super(key, values)
                end

                # emitters

                # Calls aggregate for a word key; does nothing (returns nil) for a totals key.
                def count_uniq; aggregate unless total_value? end

                # Calls aggregate for a totals key; does nothing (returns nil) for a word key.
                # The +types+ arguments are accepted but not read here.
                def total(*types); aggregate if total_value? end

                private

                # Truthy (match offset 0) when @key begins with TOTAL_PREFIX, nil otherwise.
                # \A anchors at the start of the string, whereas ^ would also match after
                # an embedded newline. Regexp.escape keeps the prefix literal even if it
                # ever contains regex metacharacters.
                def total_value?; @key =~ /\A#{Regexp.escape(TOTAL_PREFIX)}/ end
              end
         | 
| 76 | 
            +
            end
         | 
    
        data/spec/core_spec.rb
    ADDED
    
    | @@ -0,0 +1,73 @@ | |
| 1 | 
            +
            require 'init'
         | 
| 2 | 
            +
            require 'core'
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            include HadoopDsl
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            # Specs for BaseMapper, BaseReducer, BaseSetup and their models.
            describe 'BaseMapRed' do
              # A script declaring one input and one output path, shared by all examples.
              # create_tmp_script is defined outside this file.
              before(:all) do
                @script = create_tmp_script("from 'test/inputs'\nto 'test/outputs'\n")
              end

              it 'emit key value' do
                empty_model = BaseMapperModel.new(nil, nil)
                mapper = BaseMapper.new(@script, empty_model)
                mapper.emit('key' => 'value')
                mapper.emitted.should == [{'key' => 'value'}]
              end

              # The three "in minimum" examples carry no expectation; they pass
              # as long as run does not raise.
              it 'can run BaseMapper in minimum' do
                BaseMapper.new(@script, BaseMapperModel.new('key', 'value')).run
              end

              it 'can run BaseReducer in minimum' do
                BaseReducer.new(@script, BaseReducerModel.new('key', 'values')).run
              end

              it 'can run BaseSetup in minimum' do
                BaseSetup.new(@script, nil).run
              end

              describe BaseMapper do
                it 'can emit as identity' do
                  mapper_model = BaseMapperModel.new('key', 'value')
                  # Presumably the mapper attaches itself to the model on construction,
                  # so it is created before the model emits - verify in core.rb.
                  mapper = BaseMapper.new(@script, mapper_model)
                  mapper_model.identity

                  mapper.emitted.should == [{'key' => 'value'}]
                end
              end

              describe BaseReducer do
                it 'can emit as aggregate' do
                  reducer_model = BaseReducerModel.new('key', [1, 2, 3])
                  reducer = BaseReducer.new(@script, reducer_model)
                  reducer_model.aggregate

                  # 1 + 2 + 3 collapses into a single emitted pair.
                  reducer.emitted.should == [{'key' => 6}]
                end

                it 'can emit as identity' do
                  reducer_model = BaseReducerModel.new('key', [1, 2, 3])
                  reducer = BaseReducer.new(@script, reducer_model)
                  reducer_model.identity

                  # One emitted pair per incoming value, in order.
                  reducer.emitted.should == [{'key' => 1}, {'key' => 2}, {'key' => 3}]
                end
              end

              describe BaseSetup do
                it 'can get paths' do
                  setup = BaseSetup.new(@script, nil)
                  setup.run
                  input_path = setup.paths[0]
                  output_path = setup.paths[1]
                  input_path.should == 'test/inputs'
                  output_path.should == 'test/outputs'
                end
              end
            end
         | 
| @@ -0,0 +1,82 @@ | |
| 1 | 
            +
            require 'log_analysis'
         | 
| 2 | 
            +
            require 'word_count'
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            include HadoopDsl::LogAnalysis
         | 
| 5 | 
            +
            # Runs examples/apachelog-v2.rb through the LogAnalysis mapper and reducer.
            # The group name previously read 'Aapach Log Example' (typo).
            describe 'Apache Log Example' do
              before(:all) do
                @script = File.join(File.dirname(__FILE__), '..', 'examples', 'apachelog-v2.rb')
                # One access-log line whose user field is "frank".
                @value = '127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326'
              end

              it 'can run example by mapper' do
                mapper = LogAnalysisMapper.new(@script, nil, @value)
                mapper.run
                # The first emitted pair is keyed by "user<TAB>frank" with a count of 1.
                mapper.emitted.first["user\tfrank"].should == 1
              end

              it 'can run example by reducer' do
                reducer = LogAnalysisReducer.new(@script, "user\tfrank", [1, 1, 1])
                reducer.run
                # The three 1s are summed into a single count.
                reducer.emitted.first["user\tfrank"].should == 3
              end
            end
         | 
| 23 | 
            +
             | 
| 24 | 
            +
            include HadoopDsl::WordCount
         | 
| 25 | 
            +
            # Runs examples/word_count_test.rb through the WordCount mapper and reducer.
            describe 'Word Count Example' do
              before(:all) do
                @script = File.join(File.dirname(__FILE__), '..', 'examples', 'word_count_test.rb')
                # Six whitespace-separated words; 'Lorem' and 'ipsum' each appear twice.
                @value = 'Lorem ipsum ipsum Lorem sit amet,'
              end

              it 'can run example by mapper' do
                mapper = WordCountMapper.new(@script, nil, @value)
                mapper.run
                mapper.emitted.size.should == 9
                mapper.emitted.each do |e|
                  case e.keys.first
                  when 'Lorem'
                    e.values.first.should == 1
                  # The mapper emits totals under "#{TOTAL_PREFIX}total words", so the
                  # bare 'total words' key could not match if the prefix is non-empty,
                  # which left this expectation unexecuted.
                  when "#{TOTAL_PREFIX}total words"
                    e.values.first.should == 6
                  end
                end
              end

              it 'can run example by reducer' do
                reducer = WordCountReducer.new(@script, "Lorem", [1, 1, 1])
                reducer.run
                reducer.emitted.first["Lorem"].should == 3
              end
            end
         | 
| 51 | 
            +
             | 
| 52 | 
            +
            include HadoopDsl::HiveLike
         | 
| 53 | 
            +
            # Runs examples/hive_like_test.rb through the HiveLike setup, mapper and reducer.
            describe 'Hive Like Example' do
              before(:all) do
                examples_dir = File.join(File.dirname(__FILE__), '..', 'examples')
                @script = File.join(examples_dir, 'hive_like_test.rb')
                @value = 'apple, 3, 100'
              end

              it 'can run setup' do
                conf = mock('conf')
                # Each output-class setter must be called exactly once during setup.
                [:output_key_class=, :output_value_class=].each do |setter|
                  conf.should_receive(setter).once
                end

                setup = HiveLikeSetup.new(@script, conf)
                setup.run
                setup.paths[0].should == 'hive-like/items.txt'
              end

              it 'can run example by mapper' do
                mapper = HiveLikeMapper.new(@script, nil, @value)
                mapper.run
                emitted = mapper.emitted
                emitted.size.should == 1
                emitted.first['items'].should == '3, 100, apple'
              end

              it 'can run example by reducer' do
                reducer = HiveLikeReducer.new(@script, "items", ['v1', 'v2', 'v3'])
                reducer.run
                reducer.emitted.first["items"].should == 'v1'
              end
            end
         | 
| @@ -0,0 +1,58 @@ | |
| 1 | 
            +
            require 'init'
         | 
| 2 | 
            +
            require 'core'
         | 
| 3 | 
            +
            require 'hive_like'
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            include HadoopDsl::HiveLike
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            # Setup spec: a load_data statement must yield the input and output paths.
            describe HiveLikeSetup do
              it 'should load data' do
                script = create_tmp_script(%Q!load_data "hive-like/inputs", items;!)
                conf = mock('conf')
                # Each output-class setter must be called exactly once during setup.
                [:output_key_class=, :output_value_class=].each do |setter|
                  conf.should_receive(setter).once
                end

                setup = HiveLikeSetup.new(script, conf)
                setup.run
                input_path = setup.paths[0]
                output_path = setup.paths[1]
                input_path.should == 'hive-like/inputs'
                output_path.should == 'hive-like/outputs'
              end
            end
         | 
| 20 | 
            +
             | 
| 21 | 
            +
            # Mapper spec: table definition, select emission and script pre-processing.
            describe HiveLikeMapper do
              # Every example gets its own mapper over the same comma-separated row.
              before do
                @value = 'apple, 3, 100'
                @mapper = HiveLikeMapper.new(nil, nil, @value)
              end

              it 'should create table' do
                @mapper.create_table('items', 'item', 'STRING', 'quantity', 'INT', 'price', 'INT')
                table = @mapper.table
                table.name.should == 'items'
                table.column(0).should == 'item'
                table.column(1).should == 'quantity'
              end

              it 'should select' do
                @mapper.create_table('items', 'item', 'STRING', 'quantity', 'INT', 'price', 'INT')
                @mapper.select("item", "quantity", "price", "from", "items")
                @mapper.emitted.first.should == {'items' => 'apple, 3, 100'}
              end

              it 'should pre process script body' do
                # The statement becomes a Ruby call with every word quoted.
                rewritten = @mapper.pre_process("select foo, bar from table;\n")
                rewritten.should == %Q!select("foo", "bar", "from", "table")\n!
              end
            end
         | 
| 48 | 
            +
             | 
| 49 | 
            +
            # Reducer spec: select re-emits the incoming values under their key.
            describe HiveLikeReducer do
              it 'should select as identity' do
                reducer = HiveLikeReducer.new(nil, 'Lorem', [1, 1, 1])

                reducer.select
                # Only the first emitted pair is checked.
                reducer.emitted[0].should == {'Lorem' => 1}
              end
            end
         |