hadoop-rubydsl 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +53 -0
- data/Rakefile +18 -0
- data/TODO +2 -0
- data/VERSION +1 -0
- data/bin/hadoop +276 -0
- data/bin/hadoop-ruby.sh +30 -0
- data/conf/hadoop-site.xml +19 -0
- data/examples/apachelog-v2-2.rb +18 -0
- data/examples/apachelog-v2.rb +25 -0
- data/examples/apachelog.rb +15 -0
- data/examples/hive_like_test.rb +14 -0
- data/examples/word_count_test.rb +7 -0
- data/hadoop-rubydsl.gemspec +79 -0
- data/lib/core.rb +108 -0
- data/lib/hive_like.rb +122 -0
- data/lib/init.rb +60 -0
- data/lib/java/.gitignore +1 -0
- data/lib/java/hadoop-ruby.jar +0 -0
- data/lib/log_analysis.rb +165 -0
- data/lib/mapred_factory.rb +43 -0
- data/lib/util.rb +11 -0
- data/lib/word_count.rb +76 -0
- data/spec/core_spec.rb +73 -0
- data/spec/example_spec.rb +82 -0
- data/spec/hive_like_spec.rb +58 -0
- data/spec/init_spec.rb +56 -0
- data/spec/log_analysis_spec.rb +119 -0
- data/spec/mapred_factory_spec.rb +42 -0
- data/spec/spec_helper.rb +11 -0
- data/spec/util_spec.rb +15 -0
- data/spec/word_count_spec.rb +89 -0
- metadata +100 -0
data/lib/hive_like.rb
ADDED
@@ -0,0 +1,122 @@
|
|
1
|
+
require 'core'
|
2
|
+
require 'enumerator'
|
3
|
+
|
4
|
+
module HadoopDsl::HiveLike
|
5
|
+
include HadoopDsl
|
6
|
+
|
7
|
+
AVAILABLE_METHODS = [:select, :create_table, :table]
|
8
|
+
|
9
|
+
# common
|
10
|
+
# Helpers shared by the HiveLike controllers: they rewrite the SQL-flavoured
# statements of a script into plain Ruby method calls.
module HiveLikeMapRed
  # Translates a HiveLike script body into Ruby source text.
  #
  # * lines starting with '#' are dropped;
  # * a line shaped like `word args;` becomes `word("arg1", "arg2", ...)`;
  # * every other line is copied through with a newline appended.
  #
  # body:: the raw script text (String)
  # Returns the rewritten script as a String.
  def pre_process(body)
    processed = ""
    # String#each was removed in Ruby 1.9; each_line yields the same lines
    # on every Ruby version.
    body.each_line do |line|
      next if line =~ /^#/
      if line =~ /^(\w*)\s+(.*);$/
        method = $1
        args = sprit_and_marge_args($2)
        processed << "#{method}(#{args})\n"
      else
        processed << line + "\n"
      end
    end
    processed
  end

  # Splits a raw argument string on whitespace (parentheses count as
  # whitespace), removes commas and quote characters from each piece, and
  # returns the pieces as a comma-separated list of double-quoted literals.
  #
  #   sprit_and_marge_args('foo, bar from table')
  #   # => "foo", "bar", "from", "table"   (returned as one String)
  def sprit_and_marge_args(raw)
    raw.gsub(/[\(\)]/, ' ').split.map do |s|
      stripped = s.gsub(/[\s,"']/, '')
      %Q!"#{stripped}"!
    end.join(", ")
  end
end
|
33
|
+
|
34
|
+
# controller
|
35
|
+
# Setup-phase controller for HiveLike scripts: records the input/output
# locations and sets the job's output key/value classes.
class HiveLikeSetup < BaseSetup
  # Remembers +inputs+ as the source path and derives the destination by
  # replacing the final path component with 'outputs'
  # (e.g. 'hive-like/inputs' -> 'hive-like/outputs').
  #
  # inputs:: input path (String)
  # table::  table name; accepted for the DSL syntax but not used here
  def load_data(inputs, table)
    @from = inputs
    # Escape the basename: names such as "items.txt" or "a+b" contain regex
    # metacharacters and must be matched literally.
    @to = inputs.gsub(/#{Regexp.escape(File.basename(inputs))}$/, 'outputs')
  end

  # Declares Text as both the output key class and the output value class.
  def output_format
    @conf.output_key_class = Text
    @conf.output_value_class = Text
  end

  # No-op during setup. Per the original author's note this may be
  # unnecessary, but omitting it caused an error.
  def select(*args) end

  include HiveLikeMapRed
end
|
51
|
+
|
52
|
+
# Map-phase controller for HiveLike scripts.
class HiveLikeMapper < BaseMapper
  include HiveLikeMapRed

  # Forward every DSL verb named in AVAILABLE_METHODS to the model.
  def_delegators :@model, *AVAILABLE_METHODS

  # Wraps the incoming key/value pair in a HiveLikeMapperModel and hands it,
  # together with the script, to BaseMapper.
  def initialize(script, key, value)
    model = HiveLikeMapperModel.new(key, value)
    super(script, model)
  end
end
|
62
|
+
|
63
|
+
# Reduce-phase controller for HiveLike scripts.
class HiveLikeReducer < BaseReducer
  include HiveLikeMapRed

  # Forward every DSL verb named in AVAILABLE_METHODS to the model.
  def_delegators :@model, *AVAILABLE_METHODS

  # Wraps the key and its values in a HiveLikeReducerModel and hands it,
  # together with the script, to BaseReducer.
  def initialize(script, key, values)
    model = HiveLikeReducerModel.new(key, values)
    super(script, model)
  end
end
|
73
|
+
|
74
|
+
# model
|
75
|
+
# Map-side model for HiveLike scripts: holds the table declared by
# create_table and emits the columns requested by select.
class HiveLikeMapperModel < BaseMapperModel
  # The Table built by the most recent create_table call (nil before that).
  attr_reader :table

  def initialize(key, value)
    super(key, value)
  end

  # emitters

  # Declares a table.
  #
  # name::            table name
  # column_and_type:: flat list alternating column name and column type,
  #                   e.g. 'item', 'STRING', 'quantity', 'INT'
  #
  # Only the column names are stored; the types are ignored.
  def create_table(name, *column_and_type)
    @table = Table.new(name)
    column_and_type.each_with_index do |column, index|
      # even slots hold column names, odd slots hold their types
      @table.columns << column if index % 2 == 0
    end
  end

  # Handles `select col1, col2 from table`.
  #
  # args:: column names, the literal 'from', then the table name
  #
  # Splits the current input value on commas/whitespace, picks the fields at
  # the positions of the requested columns and emits
  # { table_name => "v1, v2, ..." }. Does nothing when 'from' is absent.
  #
  # Raises ArgumentError when no table was declared or a column is unknown.
  def select(*args)
    from_index = args.index('from')
    return unless from_index

    raise ArgumentError, 'select used before create_table' unless @table

    # Split once; the input value is the same for every requested column.
    fields = @value.split(/[,\s]+/)
    values = args[0...from_index].map do |column|
      position = @table.columns.index(column)
      raise ArgumentError, "unknown column: #{column}" unless position
      fields[position]
    end
    @controller.emit(args[from_index + 1] => values.join(", "))
  end

  # A table name plus its ordered list of column names.
  class Table
    attr_reader :name, :columns

    def initialize(name)
      @name = name
      @columns = []
    end

    # Returns the column name stored at +index+.
    def column(index) @columns[index] end
  end
end
|
113
|
+
|
114
|
+
# Reduce-side model for HiveLike scripts.
class HiveLikeReducerModel < BaseReducerModel
  # Passes key and values straight through to BaseReducerModel.
  def initialize(key, values)
    super
  end

  # emitters

  # Arguments are accepted but ignored; the work is delegated to +identity+.
  def select(*args)
    identity
  end
end
|
122
|
+
end
|
data/lib/init.rb
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
require 'core'
|
2
|
+
require 'java'
|
3
|
+
require 'mapred_factory'
|
4
|
+
|
5
|
+
import 'org.apache.hadoop.io.IntWritable'
|
6
|
+
import 'org.apache.hadoop.io.Text'
|
7
|
+
|
8
|
+
include HadoopDsl
|
9
|
+
|
10
|
+
# Hadoop IO types
|
11
|
+
HadoopDsl::Text = Text
|
12
|
+
HadoopDsl::IntWritable = IntWritable
|
13
|
+
|
14
|
+
# Map entry point: builds the DSL mapper for +script+, runs it on the
# stringified key/value and writes whatever it emitted to +output+.
# +reporter+ is accepted but not used.
def map(key, value, output, reporter, script)
  key_text = key.to_string
  value_text = value.to_string

  controller = MapperFactory.create(script, key_text, value_text)
  controller.run

  write(output, controller)
end
|
20
|
+
|
21
|
+
# Reduce entry point: converts each incoming value with to_ruby, builds the
# DSL reducer for +script+, runs it and writes whatever it emitted to
# +output+. +reporter+ is accepted but not used.
def reduce(key, values, output, reporter, script)
  converted = values.map { |writable| to_ruby(writable) }

  controller = ReducerFactory.create(script, key.to_string, converted)
  controller.run

  write(output, controller)
end
|
28
|
+
|
29
|
+
# Setup entry point: builds the DSL setup controller for +script+, runs it
# and returns its paths converted with to_java.
def setup(conf, script)
  job_setup = SetupFactory.create(script, conf)
  job_setup.run

  job_setup.paths.to_java
end
|
35
|
+
|
36
|
+
private
|
37
|
+
|
38
|
+
# Sends every emitted key/value pair of +controller+ to +output+, converting
# both sides with to_hadoop first.
def write(output, controller)
  controller.emitted.each do |record|
    record.each { |k, v| output.collect(to_hadoop(k), to_hadoop(v)) }
  end
end
|
45
|
+
|
46
|
+
# Unwraps a Hadoop writable: IntWritable -> value.get, Text ->
# value.to_string. Any other class raises a RuntimeError.
def to_ruby(value)
  case value
  when IntWritable
    value.get
  when Text
    value.to_string
  else
    raise "no match class: #{value.class}"
  end
end
|
53
|
+
|
54
|
+
# Wraps a Ruby value for Hadoop: Integer -> IntWritable, String -> Text.
# Any other class raises a RuntimeError.
def to_hadoop(value)
  case value
  when Integer
    IntWritable.new(value)
  when String
    text = Text.new
    text.set(value)
    text
  else
    raise "no match class: #{value.class}"
  end
end
|
data/lib/java/.gitignore
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
jruby-complete-*.jar
|
Binary file
|
data/lib/log_analysis.rb
ADDED
@@ -0,0 +1,165 @@
|
|
1
|
+
require 'core'
|
2
|
+
require 'enumerator'
|
3
|
+
|
4
|
+
module HadoopDsl::LogAnalysis
|
5
|
+
include HadoopDsl
|
6
|
+
|
7
|
+
KEY_SEP = "\t"
|
8
|
+
PREFIX = 'col'
|
9
|
+
PASS = nil
|
10
|
+
AVAILABLE_METHODS = [:separate, :pattern, :column_name, :column, :topic, :value, :count_uniq, :sum]
|
11
|
+
|
12
|
+
# common
|
13
|
+
# Structural DSL verbs shared by the LogAnalysis controllers. Both simply
# run the block they are given and return its result.
module LogAnalysisMapRed
  # entry point
  # +description+ is accepted for readability of the script and ignored.
  def data(description = '', &block)
    yield
  end

  # Runs the given block once.
  def each_line(&block)
    yield
  end
end
|
19
|
+
|
20
|
+
# controller
|
21
|
+
# Setup-phase controller for LogAnalysis scripts; adds only the shared
# structural verbs on top of BaseSetup.
class LogAnalysisSetup < BaseSetup
  include LogAnalysisMapRed

  # Passes script and conf straight through to BaseSetup.
  def initialize(script, conf)
    super
  end
end
|
28
|
+
|
29
|
+
# Map-phase controller for LogAnalysis scripts.
class LogAnalysisMapper < BaseMapper
  include LogAnalysisMapRed

  # Forward every DSL verb named in AVAILABLE_METHODS to the model.
  def_delegators :@model, *AVAILABLE_METHODS

  # Wraps the incoming key/value pair in a LogAnalysisMapperModel and hands
  # it, together with the script, to BaseMapper.
  def initialize(script, key, value)
    model = LogAnalysisMapperModel.new(key, value)
    super(script, model)
  end
end
|
39
|
+
|
40
|
+
# Reduce-phase controller for LogAnalysis scripts.
class LogAnalysisReducer < BaseReducer
  include LogAnalysisMapRed

  # Forward every DSL verb named in AVAILABLE_METHODS to the model.
  def_delegators :@model, *AVAILABLE_METHODS

  # Wraps the key and its values in a LogAnalysisReducerModel and hands it,
  # together with the script, to BaseReducer.
  def initialize(script, key, values)
    model = LogAnalysisReducerModel.new(key, values)
    super(script, model)
  end
end
|
50
|
+
|
51
|
+
# model
|
52
|
+
# Map-side model for LogAnalysis scripts: splits the input value into
# columns, lets the script name them, and emits per-topic records.
class LogAnalysisMapperModel < BaseMapperModel
  def initialize(key, value)
    super(key, value)
    @columns = ColumnArray.new
    @topics = []
  end

  # The current ColumnArray; index it by position, Symbol or String.
  def column
    @columns
  end

  # Opens a topic, makes it the current one, runs the optional block and
  # returns the topic. options[:label] overrides the label derived from
  # +desc+.
  def topic(desc, options = {}, &block)
    @current_topic = Topic.new(desc, options[:label])
    @topics << @current_topic
    yield if block_given?
    @current_topic
  end

  # Splits the input value on +sep+ and stores each piece as a column value.
  def separate(sep)
    pieces = @value.split(sep)
    create_or_replace_columns_with(pieces) { |col, piece| col.value = piece }
  end

  # Matches the input value against +re+; on success each capture group
  # becomes a column value. Returns nil when there is no match.
  def pattern(re)
    return unless @value =~ re

    captured = Regexp.last_match.captures
    create_or_replace_columns_with(captured) { |col, text| col.value = text }
  end

  # Names the columns in order. String names are converted to Symbols.
  def column_name(*names)
    symbols = names.map do |name|
      if name.is_a?(String)
        name.to_sym
      else
        name
      end
    end
    create_or_replace_columns_with(symbols) { |col, sym| col.name = sym }
  end

  # Walks +array+ by position, reusing the existing column at each position
  # (or creating a new one), and yields (column, element) so the caller can
  # update it. The column list is then replaced by exactly those columns.
  def create_or_replace_columns_with(array, &block)
    rebuilt = []
    array.each_with_index do |element, position|
      col = @columns[position] || Column.new(position)
      yield col, element
      rebuilt << col
    end
    @columns = ColumnArray.new(rebuilt)
  end

  # emitters

  # Emits { "<topic label><KEY_SEP><column value>" => 1 }.
  def count_uniq(column)
    emit_key = [@current_topic.label, KEY_SEP, column.value].join
    @controller.emit(emit_key => 1)
  end

  # Emits { "<topic label>" => column value as Integer }.
  def sum(column)
    emit_key = [@current_topic.label].join
    @controller.emit(emit_key => column.value.to_i)
  end

  # Array of Column objects addressable by Integer position or by column
  # name (Symbol or String). Any other key type yields nil.
  class ColumnArray < Array
    def [](key)
      case key
      when Integer
        at(key)
      when Symbol
        find { |c| c.name == key }
      when String
        find { |c| c.name == key.to_sym }
      end
    end
  end

  # One column: fixed position plus mutable value and name.
  class Column
    attr_reader :index
    attr_accessor :value, :name

    def initialize(index, value = nil)
      @index = index
      @value = value
    end
  end

  # A topic description with an optional explicit label.
  class Topic
    def initialize(desc, label = nil)
      @desc = desc
      @label = label
    end

    # The explicit label, or the description with each whitespace character
    # replaced by '_'.
    def label
      return @label if @label

      @desc.gsub(/\s/, '_')
    end
  end
end
|
132
|
+
|
133
|
+
# Reduce-side model for LogAnalysis scripts: aggregates the values of a key
# when the key belongs to the topic currently being declared by the script.
class LogAnalysisReducerModel < BaseReducerModel
  # key::    "<label>" or "<label><KEY_SEP><value>"
  # values:: the values collected for that key
  def initialize(key, values)
    super(key, values)
    if key
      # The label is everything before the first KEY_SEP. Matching it with
      # \w* cut labels such as "status-code" short, so they never equalled
      # the topic declared in the script and were never aggregated.
      label = key.split(KEY_SEP, 2).first.to_s
      @topic = Topic.new(label, values)
    end
  end

  # Makes the described topic the current one, runs the optional block and
  # returns the topic. The label is options[:label] or +desc+ with each
  # whitespace character replaced by '_'.
  def topic(desc, options = {}, &block)
    @current_topic = Topic.new(options[:label] || desc.gsub(/\s/, '_'), nil)
    yield if block_given?
    @current_topic
  end

  # Aggregates only when this reducer's key belongs to the current topic.
  # +column+ is accepted for DSL symmetry and not used.
  def count_uniq(column)
    aggregate if @topic == @current_topic
  end

  # Aggregates only when this reducer's key belongs to the current topic.
  # +column+ is accepted for DSL symmetry and not used.
  def sum(column)
    aggregate if @topic == @current_topic
  end

  # A topic label plus the values that arrived with it.
  class Topic
    attr_reader :label, :values

    def initialize(label, values)
      @label, @values = label, values
    end

    # Topics are equal when their labels are equal. Anything without a
    # label (e.g. nil when no topic is open yet) is simply not equal.
    def ==(rh)
      rh.respond_to?(:label) && label == rh.label
    end
  end
end
|
165
|
+
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'util'
|
2
|
+
|
3
|
+
module HadoopDsl
  # Shared plumbing for the factories below: finds out which DSL a script
  # uses, loads that DSL's library and looks up its classes.
  class MapRedFactory
    # Returns the DSL name declared by the first `use 'Name'` line of the
    # script, or nil when the script has no such line.
    #
    # The whole script is scanned; previously the method returned after
    # inspecting only the first line.
    def self.dsl_name(script)
      read_file(script).each_line do |line|
        return $1 if line =~ /^use\s*'(\w*)'/
      end
      nil
    end

    # Requires the library named after the DSL, e.g. 'WordCount' is passed
    # through snake_case and the result is required.
    def self.require_dsl_lib(dsl_name)
      require snake_case(dsl_name)
    end

    # Resolves HadoopDsl::<dsl_name>::<dsl_name><role> by constant lookup
    # rather than by eval'ing a string assembled from script content.
    # Raises NameError when the class does not exist.
    def self.dsl_class(dsl_name, role)
      HadoopDsl.const_get(dsl_name).const_get("#{dsl_name}#{role}")
    end
    private_class_method :dsl_class
  end

  # Builds the mapper controller of the DSL a script declares.
  class MapperFactory < MapRedFactory
    def self.create(script, key, value)
      dsl_name = self.dsl_name(script)
      require_dsl_lib(dsl_name)
      dsl_class(dsl_name, 'Mapper').new(script, key, value)
    end
  end

  # Builds the reducer controller of the DSL a script declares.
  class ReducerFactory < MapRedFactory
    def self.create(script, key, values)
      dsl_name = self.dsl_name(script)
      require_dsl_lib(dsl_name)
      dsl_class(dsl_name, 'Reducer').new(script, key, values)
    end
  end

  # Builds the setup controller of the DSL a script declares.
  class SetupFactory < MapRedFactory
    def self.create(script, conf)
      dsl_name = self.dsl_name(script)
      require_dsl_lib(dsl_name)
      begin
        dsl_class(dsl_name, 'Setup').new(script, conf)
      rescue StandardError
        # Falls back to the generic setup when the DSL-specific one cannot
        # be looked up or constructed (same breadth as the original
        # `rescue` modifier).
        HadoopDsl::BaseSetup.new(script, conf)
      end
    end
  end
end
|
data/lib/util.rb
ADDED
data/lib/word_count.rb
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
require 'core'
|
2
|
+
require 'enumerator'
|
3
|
+
|
4
|
+
module HadoopDsl::WordCount
|
5
|
+
include HadoopDsl
|
6
|
+
|
7
|
+
AVAILABLE_METHODS = [:count_uniq, :total]
|
8
|
+
TOTAL_PREFIX = "\t"
|
9
|
+
|
10
|
+
# common
|
11
|
+
# Structural DSL verb shared by the WordCount controllers.
module WordCountMapRed
  # entry point
  # Runs the given block and returns its result; +description+ is accepted
  # for readability of the script and ignored.
  def data(description = '', &block)
    yield
  end
end
|
15
|
+
|
16
|
+
# controller
|
17
|
+
# Map-phase controller for WordCount scripts.
class WordCountMapper < BaseMapper
  include WordCountMapRed

  # Forward every DSL verb named in AVAILABLE_METHODS to the model.
  def_delegators :@model, *AVAILABLE_METHODS

  # Wraps the incoming key/value pair in a WordCountMapperModel and hands
  # it, together with the script, to BaseMapper.
  def initialize(script, key, value)
    model = WordCountMapperModel.new(key, value)
    super(script, model)
  end
end
|
27
|
+
|
28
|
+
# Reduce-phase controller for WordCount scripts.
class WordCountReducer < BaseReducer
  include WordCountMapRed

  # Forward every DSL verb named in AVAILABLE_METHODS to the model.
  def_delegators :@model, *AVAILABLE_METHODS

  # Wraps the key and its values in a WordCountReducerModel and hands it,
  # together with the script, to BaseReducer.
  def initialize(script, key, values)
    model = WordCountReducerModel.new(key, values)
    super(script, model)
  end
end
|
38
|
+
|
39
|
+
# model
|
40
|
+
# Map-side model for WordCount scripts.
class WordCountMapperModel < BaseMapperModel
  # Passes key and value straight through to BaseMapperModel.
  def initialize(key, value)
    super
  end

  # emitters

  # Emits { word => 1 } for every whitespace-separated word of the value.
  def count_uniq
    @value.split.each do |word|
      @controller.emit(word => 1)
    end
  end

  # Emits one total record per requested type. Keys carry TOTAL_PREFIX so
  # the reducer can tell them apart from ordinary words.
  #
  # :bytes:: length of the value with all whitespace removed
  # :words:: number of whitespace-separated words
  # :lines:: 1
  #
  # Unrecognised types are ignored.
  def total(*types)
    types.each do |type|
      if :bytes == type
        stripped_length = @value.gsub(/\s/, '').length
        @controller.emit("#{TOTAL_PREFIX}total bytes" => stripped_length)
      elsif :words == type
        word_count = @value.split.size
        @controller.emit("#{TOTAL_PREFIX}total words" => word_count)
      elsif :lines == type
        @controller.emit("#{TOTAL_PREFIX}total lines" => 1)
      end
    end
  end
end
|
63
|
+
|
64
|
+
# Reduce-side model for WordCount scripts: word keys are aggregated by
# count_uniq, TOTAL_PREFIX keys by total.
class WordCountReducerModel < BaseReducerModel
  # Passes key and values straight through to BaseReducerModel.
  def initialize(key, values)
    super
  end

  # emitters

  # Aggregates unless the key is a total record.
  def count_uniq
    aggregate unless total_value?
  end

  # Aggregates only when the key is a total record; +types+ is ignored.
  def total(*types)
    aggregate if total_value?
  end

  private

  # Truthy (the match position) when the key starts with TOTAL_PREFIX,
  # nil otherwise.
  def total_value?
    @key =~ /^#{TOTAL_PREFIX}/
  end
end
|
76
|
+
end
|
data/spec/core_spec.rb
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
require 'init'
|
2
|
+
require 'core'
|
3
|
+
|
4
|
+
include HadoopDsl
|
5
|
+
|
6
|
+
describe 'BaseMapRed' do
|
7
|
+
before(:all) do
|
8
|
+
@script = create_tmp_script(<<-EOF)
|
9
|
+
from 'test/inputs'
|
10
|
+
to 'test/outputs'
|
11
|
+
EOF
|
12
|
+
end
|
13
|
+
|
14
|
+
it 'emit key value' do
|
15
|
+
mapper = BaseMapper.new(@script, BaseMapperModel.new(nil, nil))
|
16
|
+
mapper.emit('key' => 'value')
|
17
|
+
mapper.emitted.should == [{'key' => 'value'}]
|
18
|
+
end
|
19
|
+
|
20
|
+
it 'can run BaseMapper in minimum' do
|
21
|
+
model = BaseMapperModel.new('key', 'value')
|
22
|
+
mapper = BaseMapper.new(@script, model)
|
23
|
+
mapper.run
|
24
|
+
end
|
25
|
+
|
26
|
+
it 'can run BaseReducer in minimum' do
|
27
|
+
model = BaseReducerModel.new('key', 'values')
|
28
|
+
reducer = BaseReducer.new(@script, model)
|
29
|
+
reducer.run
|
30
|
+
end
|
31
|
+
|
32
|
+
it 'can run BaseSetup in minimum' do
|
33
|
+
setup = BaseSetup.new(@script, nil)
|
34
|
+
setup.run
|
35
|
+
end
|
36
|
+
|
37
|
+
describe BaseMapper do
|
38
|
+
it 'can emit as identity' do
|
39
|
+
model = BaseMapperModel.new('key', 'value')
|
40
|
+
mapper = BaseMapper.new(@script, model)
|
41
|
+
model.identity
|
42
|
+
|
43
|
+
mapper.emitted.should == [{'key' => 'value'}]
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
describe BaseReducer do
|
48
|
+
it 'can emit as aggregate' do
|
49
|
+
model = BaseReducerModel.new('key', [1, 2, 3])
|
50
|
+
reducer = BaseReducer.new(@script, model)
|
51
|
+
model.aggregate
|
52
|
+
|
53
|
+
reducer.emitted.should == [{'key' => 6}]
|
54
|
+
end
|
55
|
+
|
56
|
+
it 'can emit as identity' do
|
57
|
+
model = BaseReducerModel.new('key', [1, 2, 3])
|
58
|
+
reducer = BaseReducer.new(@script, model)
|
59
|
+
model.identity
|
60
|
+
|
61
|
+
reducer.emitted.should == [{'key' => 1}, {'key' => 2}, {'key' => 3}]
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
describe BaseSetup do
|
66
|
+
it 'can get paths' do
|
67
|
+
setup = BaseSetup.new(@script, nil)
|
68
|
+
setup.run
|
69
|
+
setup.paths[0].should == 'test/inputs'
|
70
|
+
setup.paths[1].should == 'test/outputs'
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
@@ -0,0 +1,82 @@
|
|
1
|
+
require 'log_analysis'
|
2
|
+
require 'word_count'
|
3
|
+
|
4
|
+
include HadoopDsl::LogAnalysis
|
5
|
+
describe 'Aapach Log Example' do
|
6
|
+
before(:all) do
|
7
|
+
@script = File.join(File.dirname(__FILE__), '..', 'examples', 'apachelog-v2.rb')
|
8
|
+
@value = '127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326'
|
9
|
+
end
|
10
|
+
|
11
|
+
it 'can run example by mapper' do
|
12
|
+
mapper = LogAnalysisMapper.new(@script, nil, @value)
|
13
|
+
mapper.run
|
14
|
+
mapper.emitted.first["user\tfrank"].should == 1
|
15
|
+
end
|
16
|
+
|
17
|
+
it 'can run example by reducer' do
|
18
|
+
reducer = LogAnalysisReducer.new(@script, "user\tfrank", [1, 1, 1])
|
19
|
+
reducer.run
|
20
|
+
reducer.emitted.first["user\tfrank"].should == 3
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
include HadoopDsl::WordCount
|
25
|
+
describe 'Word Count Example' do
|
26
|
+
before(:all) do
|
27
|
+
@script = File.join(File.dirname(__FILE__), '..', 'examples', 'word_count_test.rb')
|
28
|
+
@value = 'Lorem ipsum ipsum Lorem sit amet,'
|
29
|
+
end
|
30
|
+
|
31
|
+
it 'can run example by mapper' do
|
32
|
+
mapper = WordCountMapper.new(@script, nil, @value)
|
33
|
+
mapper.run
|
34
|
+
mapper.emitted.size.should == 9
|
35
|
+
mapper.emitted.each do |e|
|
36
|
+
case e.keys.first
|
37
|
+
when 'Lorem'
|
38
|
+
e.values.first.should == 1
|
39
|
+
when 'total words'
|
40
|
+
e.values.first.should == 6
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
it 'can run example by reducer' do
|
46
|
+
reducer = WordCountReducer.new(@script, "Lorem", [1, 1, 1])
|
47
|
+
reducer.run
|
48
|
+
reducer.emitted.first["Lorem"].should == 3
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
include HadoopDsl::HiveLike
|
53
|
+
describe 'Hive Like Example' do
|
54
|
+
before(:all) do
|
55
|
+
@script = File.join(File.dirname(__FILE__), '..', 'examples', 'hive_like_test.rb')
|
56
|
+
@value = 'apple, 3, 100'
|
57
|
+
end
|
58
|
+
|
59
|
+
it 'can run setup' do
|
60
|
+
conf = mock('conf')
|
61
|
+
conf.should_receive(:output_key_class=).once
|
62
|
+
conf.should_receive(:output_value_class=).once
|
63
|
+
|
64
|
+
setup = HiveLikeSetup.new(@script, conf)
|
65
|
+
setup.run
|
66
|
+
setup.paths[0].should == 'hive-like/items.txt'
|
67
|
+
end
|
68
|
+
|
69
|
+
it 'can run example by mapper' do
|
70
|
+
mapper = HiveLikeMapper.new(@script, nil, @value)
|
71
|
+
mapper.run
|
72
|
+
mapper.emitted.size.should == 1
|
73
|
+
mapper.emitted.first['items'].should == '3, 100, apple'
|
74
|
+
end
|
75
|
+
|
76
|
+
it 'can run example by reducer' do
|
77
|
+
values = ['v1', 'v2', 'v3']
|
78
|
+
reducer = HiveLikeReducer.new(@script, "items", values)
|
79
|
+
reducer.run
|
80
|
+
reducer.emitted.first["items"].should == 'v1'
|
81
|
+
end
|
82
|
+
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'init'
|
2
|
+
require 'core'
|
3
|
+
require 'hive_like'
|
4
|
+
|
5
|
+
include HadoopDsl::HiveLike
|
6
|
+
|
7
|
+
describe HiveLikeSetup do
|
8
|
+
it 'should load data' do
|
9
|
+
script = create_tmp_script(%Q!load_data "hive-like/inputs", items;!)
|
10
|
+
conf = mock('conf')
|
11
|
+
conf.should_receive(:output_key_class=).once
|
12
|
+
conf.should_receive(:output_value_class=).once
|
13
|
+
|
14
|
+
setup = HiveLikeSetup.new(script, conf)
|
15
|
+
setup.run
|
16
|
+
setup.paths[0].should == 'hive-like/inputs'
|
17
|
+
setup.paths[1].should == 'hive-like/outputs'
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
describe HiveLikeMapper do
|
22
|
+
before do
|
23
|
+
@value = 'apple, 3, 100'
|
24
|
+
end
|
25
|
+
|
26
|
+
it 'should create table' do
|
27
|
+
mapper = HiveLikeMapper.new(nil, nil, @value)
|
28
|
+
mapper.create_table('items', 'item', 'STRING', 'quantity', 'INT', 'price', 'INT');
|
29
|
+
mapper.table.name.should == 'items'
|
30
|
+
mapper.table.column(0).should == 'item'
|
31
|
+
mapper.table.column(1).should == 'quantity'
|
32
|
+
end
|
33
|
+
|
34
|
+
it 'should select' do
|
35
|
+
mapper = HiveLikeMapper.new(nil, nil, @value)
|
36
|
+
mapper.create_table('items', 'item', 'STRING', 'quantity', 'INT', 'price', 'INT');
|
37
|
+
mapper.select("item", "quantity", "price", "from", "items")
|
38
|
+
mapper.emitted.first.should == {'items' => 'apple, 3, 100'}
|
39
|
+
end
|
40
|
+
|
41
|
+
it 'should pre process script body' do
|
42
|
+
body = "select foo, bar from table;\n"
|
43
|
+
mapper = HiveLikeMapper.new(nil, nil, @value)
|
44
|
+
processed = mapper.pre_process(body)
|
45
|
+
processed.should == %Q!select("foo", "bar", "from", "table")\n!
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
describe HiveLikeReducer do
|
50
|
+
it 'should select as identity' do
|
51
|
+
key = 'Lorem'
|
52
|
+
values = [1, 1, 1]
|
53
|
+
reducer = HiveLikeReducer.new(nil, key, values)
|
54
|
+
|
55
|
+
reducer.select
|
56
|
+
reducer.emitted[0].should == {'Lorem' => 1}
|
57
|
+
end
|
58
|
+
end
|