hadoop-papyrus 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +1 -0
- data/README.rdoc +58 -0
- data/Rakefile +18 -0
- data/VERSION +1 -0
- data/bin/papyrus +5 -0
- data/conf/hadoop-site.xml +19 -0
- data/contrib/hudson/hadoop-hudson.sh +276 -0
- data/contrib/hudson/hadoop-ruby.sh +30 -0
- data/examples/hive_like_test.rb +14 -0
- data/examples/log_analysis_test.rb +43 -0
- data/examples/word_count_test.rb +7 -0
- data/hadoop-papyrus.gemspec +77 -0
- data/lib/core.rb +106 -0
- data/lib/dsl_init.rb +33 -0
- data/lib/hadoop_dsl.rb +14 -0
- data/lib/hadoop_dsl_client.rb +37 -0
- data/lib/hive_like.rb +106 -0
- data/lib/log_analysis.rb +213 -0
- data/lib/mapred_factory.rb +54 -0
- data/lib/util.rb +38 -0
- data/lib/word_count.rb +56 -0
- data/spec/client_spec.rb +27 -0
- data/spec/core_spec.rb +73 -0
- data/spec/dsl_init_spec.rb +49 -0
- data/spec/example_spec.rb +84 -0
- data/spec/hive_like_spec.rb +57 -0
- data/spec/log_analysis_spec.rb +184 -0
- data/spec/mapred_factory_spec.rb +74 -0
- data/spec/spec_helper.rb +10 -0
- data/spec/util_spec.rb +34 -0
- data/spec/word_count_spec.rb +88 -0
- metadata +96 -0
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'hadoop_dsl'
|
2
|
+
|
3
|
+
module HadoopDsl
  # Factories that resolve and instantiate the Mapper/Reducer/Setup classes
  # declared by a user DSL script via its `dsl 'Name'` directive.
  class MapRedFactory
    # Scans the script body for a `dsl 'Name'` / `dsl("Name")` directive and
    # returns the declared DSL name (e.g. "WordCount").
    # Returns nil when no directive is present (previously the method fell
    # through and returned `each_line`'s receiver — the whole script body).
    def self.dsl_name(script)
      HadoopDsl.read_file(script).each_line do |line|
        return Regexp.last_match(1) if line =~ /\s*dsl\s*\(?["'](\w*)["']\)?/
      end
      nil
    end

    # Loads the library implementing the named DSL ("WordCount" -> word_count).
    def self.require_dsl_lib(dsl_name)
      require HadoopDsl.snake_case(dsl_name)
    end

    # Resolves HadoopDsl::<dsl_name>::<dsl_name><suffix> via const_get —
    # avoids eval'ing a string interpolated from user script content.
    def self.dsl_class(dsl_name, suffix)
      HadoopDsl.const_get(dsl_name).const_get("#{dsl_name}#{suffix}")
    end
  end

  class MapperFactory < MapRedFactory
    # Cache the resolved mapper class: create() runs once per input record
    # in the map loop, and re-parsing the script each time would be slow.
    @@mapper_class = nil

    def self.create(script, key, value)
      # decide the class only once per map loop
      unless @@mapper_class
        dsl_name = self.dsl_name(script)
        require_dsl_lib(dsl_name)
        @@mapper_class = dsl_class(dsl_name, 'Mapper')
      end

      @@mapper_class.new(script, key, value)
    end
  end

  class ReducerFactory < MapRedFactory
    # Cached reducer class, resolved once per reduce loop (see MapperFactory).
    @@reducer_class = nil

    def self.create(script, key, values)
      # decide the class only once per reduce loop
      unless @@reducer_class
        dsl_name = self.dsl_name(script)
        require_dsl_lib(dsl_name)
        @@reducer_class = dsl_class(dsl_name, 'Reducer')
      end

      @@reducer_class.new(script, key, values)
    end
  end

  class SetupFactory < MapRedFactory
    # Builds the DSL-specific Setup object; deliberately falls back to
    # BaseSetup when the DSL defines no Setup class (const_get raises
    # NameError in that case, caught by the rescue modifier).
    def self.create(script, conf)
      dsl_name = self.dsl_name(script)
      require_dsl_lib(dsl_name)
      dsl_class(dsl_name, 'Setup').new(script, conf) rescue HadoopDsl::BaseSetup.new(script, conf)
    end
  end
end
data/lib/util.rb
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
# utility functions
|
2
|
+
require 'hadoop_dsl'
|
3
|
+
|
4
|
+
module HadoopDsl
  # File-body cache keyed by file name.
  # Re-reading the script from disk inside the map/reduce loop caused
  # critical performance issues, so bodies are read once and memoized.
  @@file_bodies = {}

  # Converts CamelCase to snake_case: "WordCount" -> "word_count".
  def self.snake_case(str)
    str.gsub(/\B[A-Z]/, '_\&').downcase
  end

  # Returns the contents of file_name, trying in order: the cache, the
  # literal path, then each directory on the load path ($:).
  # Raises RuntimeError when the file cannot be found anywhere.
  def self.read_file(file_name)
    # use cached body if present
    body = @@file_bodies[file_name]

    # read as usual — File.read closes the handle
    # (the previous File.open(file_name).read leaked it)
    body ||= begin
      File.read(file_name)
    rescue
      nil
    end

    # fall back to searching the load path
    unless body
      $:.each do |path|
        body = File.read(File.join(path, file_name)) rescue next
        break
      end
    end

    raise "cannot find file - #{file_name}" unless body

    # cache for subsequent calls
    @@file_bodies[file_name] = body
    body
  end

  # Clears the body cache (used between DSL runs / in tests).
  def self.reset_dsl_file
    @@file_bodies = {}
  end
end
data/lib/word_count.rb
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
require 'hadoop_dsl'
|
2
|
+
require 'enumerator'
|
3
|
+
|
4
|
+
module HadoopDsl::WordCount
  # No model methods are exposed to the DSL for word counting.
  MODEL_METHODS = []
  # Aggregate-total keys are tagged with a leading tab so they are
  # distinguishable from ordinary word keys on the reduce side.
  TOTAL_PREFIX = "\t"

  # controller — map phase
  class WordCountMapper < HadoopDsl::BaseMapper
    def initialize(script, key, value)
      super(script, WordCountMapperModel.new(key, value))
    end

    # model methods
    def_delegators :@model, *MODEL_METHODS

    # emitters

    # Emits {word => 1} for every whitespace-separated token of the line.
    def count_uniq
      @model.value.split.each do |word|
        emit(word => 1)
      end
    end

    # Emits per-line contributions for the requested aggregate kinds
    # (:bytes, :words, :lines); unrecognized kinds are silently ignored.
    def total(*types)
      types.each do |kind|
        case kind
        when :bytes
          emit("#{TOTAL_PREFIX}total bytes" => @model.value.gsub(/\s/, '').length)
        when :words
          emit("#{TOTAL_PREFIX}total words" => @model.value.split.size)
        when :lines
          emit("#{TOTAL_PREFIX}total lines" => 1)
        end
      end
    end
  end

  # controller — reduce phase
  class WordCountReducer < HadoopDsl::BaseReducer
    def initialize(script, key, values)
      super(script, WordCountReducerModel.new(key, values))
    end

    # model methods
    def_delegators :@model, *MODEL_METHODS

    # emitters: each sums its own kind of key and ignores the other,
    # so word counts and tagged totals never mix.
    def count_uniq
      aggregate unless @model.total_value?
    end

    def total(*types)
      aggregate if @model.total_value?
    end
  end

  # model
  class WordCountMapperModel < HadoopDsl::BaseMapperModel
  end

  class WordCountReducerModel < HadoopDsl::BaseReducerModel
    # true when this key is one of the tab-tagged aggregate-total keys
    def total_value?
      @key =~ /^#{TOTAL_PREFIX}/
    end
  end
end
data/spec/client_spec.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'spec_helper')
|
2
|
+
require 'hadoop_dsl_client'
|
3
|
+
|
4
|
+
describe HadoopDsl::Client do
  before do
    # args: DSL script, input path, output path
    @client = HadoopDsl::Client.new(["examples/wordcount.rb", "in", "out"])
  end

  it 'can parse args' do
    joined = @client.files.join
    joined.should match(/ruby_wrapper\.rb/)
    joined.should match(/dsl_init\.rb/)
    @client.files.should include 'examples/wordcount.rb'
    @client.inputs.should == 'in'
    @client.outputs.should == 'out'
  end

  it 'can add dsl file into mapred args' do
    # the DSL script is shipped to the job as --dslfile
    @client.mapred_args.should == "--script dsl_init.rb in out --dslfile wordcount.rb"
  end

  it 'can add dsl lib files' do
    # every DSL library under lib/ must travel with the job
    lib_path = HadoopDsl.lib_path
    @client.files.should include File.join(lib_path, 'core.rb')
    @client.files.should include File.join(lib_path, 'log_analysis.rb')
  end
end
data/spec/core_spec.rb
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'spec_helper')
|
2
|
+
require 'core'
|
3
|
+
|
4
|
+
include HadoopDsl

describe 'BaseMapRed' do
  before(:all) do
    # Minimal DSL script: only input/output paths, no map/reduce logic.
    @script = create_tmp_script(<<-EOF)
      from 'test/inputs'
      to 'test/outputs'
    EOF
  end

  it 'emit key value' do
    mapper = BaseMapper.new(@script, BaseMapperModel.new(nil, nil))
    mapper.emit('key' => 'value')
    mapper.emitted.should == [{'key' => 'value'}]
  end

  it 'can run BaseMapper in minimum' do
    minimum_model = BaseMapperModel.new('key', 'value')
    BaseMapper.new(@script, minimum_model).run
  end

  it 'can run BaseReducer in minimum' do
    minimum_model = BaseReducerModel.new('key', 'values')
    BaseReducer.new(@script, minimum_model).run
  end

  it 'can run BaseSetup in minimum' do
    BaseSetup.new(@script, nil).run
  end

  describe BaseMapper do
    it 'can emit as identity' do
      # identity passes the key/value pair straight through
      mapper = BaseMapper.new(@script, BaseMapperModel.new('key', 'value'))
      mapper.identity

      mapper.emitted.should == [{'key' => 'value'}]
    end
  end

  describe BaseReducer do
    it 'can emit as aggregate' do
      # aggregate sums the values for one key
      reducer = BaseReducer.new(@script, BaseReducerModel.new('key', [1, 2, 3]))
      reducer.aggregate

      reducer.emitted.should == [{'key' => 6}]
    end

    it 'can emit as identity' do
      # identity re-emits each value under the same key
      reducer = BaseReducer.new(@script, BaseReducerModel.new('key', [1, 2, 3]))
      reducer.identity

      reducer.emitted.should == [{'key' => 1}, {'key' => 2}, {'key' => 3}]
    end
  end

  describe BaseSetup do
    it 'can get paths' do
      # from/to in the script become the job's input/output paths
      setup = BaseSetup.new(@script, nil)
      setup.run
      setup.paths[0].should == 'test/inputs'
      setup.paths[1].should == 'test/outputs'
    end
  end
end
@@ -0,0 +1,49 @@
|
|
1
|
+
require 'dsl_init'
|
2
|
+
|
3
|
+
describe 'mapreduce init' do

  before(:all) do
    # A LogAnalysis DSL script that exercises the generic map/reduce/setup
    # entry points defined in dsl_init.rb.
    @script = create_tmp_script(<<-EOF)
      dsl 'LogAnalysis'
      data 'test' do
        from 'test/inputs'
        to 'test/outputs'

        separate(" ")
        column_name 'c0', 'c1', 'c2', 'c3'
        topic 't1' do
          count_uniq columns(:c1)
        end
      end
    EOF
  end

  before do
    @one = 1
    @output = mock('output')
  end

  it 'can map successfully' do
    key = 'key'
    value = 'it should be fine'
    # one record in, one key/value pair collected
    @output.should_receive(:collect).once #.with(@text, @one)

    map(key, value, @output, nil, @script)
  end

  it 'can reduce successfully' do
    key = "t1\tkey"
    values = [@one, @one, @one]
    # the three counts collapse into a single collected pair
    @output.should_receive(:collect).once #.with(@text, @one)

    reduce(key, values, @output, nil, @script)
  end

  it 'can set job conf' do
    conf = mock('jobconf')
    paths = setup(conf, @script)

    paths[0].should == 'test/inputs'
    paths[1].should == 'test/outputs'
  end
end
@@ -0,0 +1,84 @@
|
|
1
|
+
require 'log_analysis'
|
2
|
+
require 'word_count'
|
3
|
+
require 'hive_like'
|
4
|
+
|
5
|
+
include HadoopDsl::LogAnalysis
# Runs each shipped example script end-to-end through its mapper/reducer.
describe 'Apache Log Example' do
  before(:all) do
    @script = File.join(File.dirname(__FILE__), '..', 'examples', 'log_analysis_test.rb')
    @bot_ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
    # one combined-format Apache access-log line
    @value = %Q!127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326 "-" "#{@bot_ua}"!
  end

  it 'can run example by mapper' do
    mapper = LogAnalysisMapper.new(@script, nil, @value)
    mapper.run
    mapper.emitted.first.should == {"ua\t#{@bot_ua}" => 1}
  end

  it 'can run example by reducer' do
    reducer = LogAnalysisReducer.new(@script, "ua\tChrome", [1, 1, 1])
    reducer.run
    reducer.emitted.first["ua\tChrome"].should == 3
  end
end

include HadoopDsl::WordCount
describe 'Word Count Example' do
  before(:all) do
    @script = File.join(File.dirname(__FILE__), '..', 'examples', 'word_count_test.rb')
    @value = 'Lorem ipsum ipsum Lorem sit amet,'
  end

  it 'can run example by mapper' do
    mapper = WordCountMapper.new(@script, nil, @value)
    mapper.run
    # 6 unique-word emits + bytes/words/lines totals = 9
    mapper.emitted.size.should == 9
    mapper.emitted.each do |e|
      case e.keys.first
      when 'Lorem'
        e.values.first.should == 1
      when 'total words'
        e.values.first.should == 6
      end
    end
  end

  it 'can run example by reducer' do
    reducer = WordCountReducer.new(@script, "Lorem", [1, 1, 1])
    reducer.run
    reducer.emitted.first["Lorem"].should == 3
  end
end

include HadoopDsl::HiveLike
describe 'Hive Like Example' do
  before(:all) do
    @script = File.join(File.dirname(__FILE__), '..', 'examples', 'hive_like_test.rb')
    @value = 'apple, 3, 100'
  end

  it 'can run setup' do
    conf = mock('conf')
    conf.should_receive(:output_key_class=).once
    conf.should_receive(:output_value_class=).once

    setup = HiveLikeSetup.new(@script, conf)
    setup.run
    setup.paths[0].should == 'hive-like/items.txt'
  end

  it 'can run example by mapper' do
    mapper = HiveLikeMapper.new(@script, nil, @value)
    mapper.run
    mapper.emitted.size.should == 1
    mapper.emitted.first['items'].should == '3, 100, apple'
  end

  it 'can run example by reducer' do
    values = ['v1', 'v2', 'v3']
    reducer = HiveLikeReducer.new(@script, "items", values)
    reducer.run
    reducer.emitted.first["items"].should == 'v1'
  end
end
@@ -0,0 +1,57 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'spec_helper')
|
2
|
+
require 'hive_like'
|
3
|
+
|
4
|
+
include HadoopDsl::HiveLike

describe HiveLikeSetup do
  it 'should load data' do
    # load_data sets the input path; the output path is derived from it
    script = create_tmp_script(%Q!load_data "hive-like/inputs", items;!)
    conf = mock('conf')
    conf.should_receive(:output_key_class=).once
    conf.should_receive(:output_value_class=).once

    setup = HiveLikeSetup.new(script, conf)
    setup.run
    setup.paths[0].should == 'hive-like/inputs'
    setup.paths[1].should == 'hive-like/outputs'
  end
end

describe HiveLikeMapper do
  before do
    @value = 'apple, 3, 100'
  end

  it 'should create table' do
    mapper = HiveLikeMapper.new(nil, nil, @value)
    # name followed by column/type pairs
    mapper.create_table('items', 'item', 'STRING', 'quantity', 'INT', 'price', 'INT')
    mapper.table.name.should == 'items'
    mapper.table.column(0).should == 'item'
    mapper.table.column(1).should == 'quantity'
  end

  it 'should select' do
    mapper = HiveLikeMapper.new(nil, nil, @value)
    mapper.create_table('items', 'item', 'STRING', 'quantity', 'INT', 'price', 'INT')
    mapper.select("item", "quantity", "price", "from", "items")
    mapper.emitted.first.should == {'items' => 'apple, 3, 100'}
  end

  it 'should pre process script body' do
    # SQL-ish lines are rewritten into plain Ruby method calls
    body = "select foo, bar from table;\n"
    mapper = HiveLikeMapper.new(nil, nil, @value)
    processed = mapper.pre_process(body)
    processed.should == %Q!select("foo", "bar", "from", "table")\n!
  end
end

describe HiveLikeReducer do
  it 'should select as identity' do
    # reducer-side select passes the first value through unchanged
    reducer = HiveLikeReducer.new(nil, 'Lorem', [1, 1, 1])

    reducer.select
    reducer.emitted[0].should == {'Lorem' => 1}
  end
end