hadoop-papyrus 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,54 @@
1
+ require 'hadoop_dsl'
2
+
3
module HadoopDsl
  # Base class for the mapper/reducer/setup factories.
  #
  # A DSL script declares which DSL it uses with a line such as
  #   dsl 'WordCount'
  # The factories scan the script for that declaration, require the
  # matching library file and instantiate the DSL-specific class.
  class MapRedFactory
    # Returns the DSL name declared in +script+ (e.g. "WordCount"),
    # or nil when the script contains no dsl declaration.
    # (Previously fell through and returned the enumerated string itself.)
    def self.dsl_name(script)
      HadoopDsl.read_file(script).each_line do |line|
        # accepts: dsl 'Name' / dsl "Name" / dsl('Name')
        return Regexp.last_match(1) if line =~ /\s*dsl\s*\(?["'](\w*)["']\)?/
      end
      nil
    end

    # Requires the library implementing +dsl_name+
    # (CamelCase DSL name -> snake_case file name).
    def self.require_dsl_lib(dsl_name)
      require HadoopDsl.snake_case(dsl_name)
    end

    # Resolves e.g. HadoopDsl::WordCount::WordCountMapper for suffix "Mapper".
    # const_get avoids eval'ing a string built from the script body.
    # Raises NameError when the DSL does not define the class.
    def self.resolve_class(dsl_name, suffix)
      HadoopDsl.const_get(dsl_name).const_get("#{dsl_name}#{suffix}")
    end
  end

  class MapperFactory < MapRedFactory
    # Cached mapper class: create() is called once per map invocation,
    # so the require/constant-resolution work is done only on the first call.
    @@mapper_class = nil

    # Creates a DSL-specific mapper for +script+ with the given key/value.
    def self.create(script, key, value)
      unless @@mapper_class
        dsl_name = self.dsl_name(script)
        require_dsl_lib(dsl_name)
        @@mapper_class = resolve_class(dsl_name, 'Mapper')
      end

      @@mapper_class.new(script, key, value)
    end
  end

  class ReducerFactory < MapRedFactory
    # Cached reducer class, resolved once per reduce loop (see MapperFactory).
    @@reducer_class = nil

    # Creates a DSL-specific reducer for +script+ with the given key/values.
    def self.create(script, key, values)
      unless @@reducer_class
        dsl_name = self.dsl_name(script)
        require_dsl_lib(dsl_name)
        @@reducer_class = resolve_class(dsl_name, 'Reducer')
      end

      @@reducer_class.new(script, key, values)
    end
  end

  class SetupFactory < MapRedFactory
    # Creates the DSL-specific setup object, falling back to BaseSetup
    # when the DSL does not define its own Setup class.
    def self.create(script, conf)
      dsl_name = self.dsl_name(script)
      require_dsl_lib(dsl_name)
      begin
        resolve_class(dsl_name, 'Setup').new(script, conf)
      rescue StandardError
        HadoopDsl::BaseSetup.new(script, conf)
      end
    end
  end
end
@@ -0,0 +1,38 @@
1
+ # utility functions
2
+ require 'hadoop_dsl'
3
+
4
module HadoopDsl
  # Cache of script file bodies, keyed by file name.
  # Re-reading files inside the map/reduce loop caused critical issues,
  # so each file is read once and served from this cache afterwards.
  @@file_bodies = {}

  # Converts a CamelCase string to snake_case,
  # e.g. "WordCount" -> "word_count".
  def self.snake_case(str)
    str.gsub(/\B[A-Z]/, '_\&').downcase
  end

  # Returns the body of +file_name+, looking first in the cache, then at the
  # path as given, then relative to each entry of the load path ($:).
  # Raises RuntimeError when the file cannot be found anywhere.
  def self.read_file(file_name)
    body = @@file_bodies[file_name]
    return body if body

    # read as usual
    body = safe_read(file_name)

    # fall back to the load path
    unless body
      $:.each do |path|
        body = safe_read(File.join(path, file_name))
        break if body
      end
    end

    raise "cannot find file - #{file_name}" unless body

    # cache for subsequent calls
    @@file_bodies[file_name] = body
    body
  end

  # File.read closes the handle itself (the previous File.open(...).read
  # leaked one file descriptor per read); returns nil on any I/O failure.
  def self.safe_read(path)
    File.read(path)
  rescue SystemCallError, IOError
    nil
  end
  private_class_method :safe_read

  # Clears the body cache (used between DSL runs / in tests).
  def self.reset_dsl_file
    @@file_bodies = {}
  end
end
@@ -0,0 +1,56 @@
1
+ require 'hadoop_dsl'
2
+ require 'enumerator'
3
+
4
module HadoopDsl::WordCount
  MODEL_METHODS = []
  # totals are emitted under tab-prefixed keys so they sort apart from words
  TOTAL_PREFIX = "\t"

  # controller: emits word counts and/or totals for one input line
  class WordCountMapper < HadoopDsl::BaseMapper
    def initialize(script, key, value)
      super(script, WordCountMapperModel.new(key, value))
    end

    # model methods
    def_delegators :@model, *MODEL_METHODS

    # emitter: one {word => 1} pair per whitespace-separated token
    def count_uniq
      @model.value.split.each { |token| emit(token => 1) }
    end

    # emitter: per-line totals for the requested types (:bytes/:words/:lines)
    def total(*types)
      types.each do |type|
        label, amount =
          case type
          when :bytes then ["#{TOTAL_PREFIX}total bytes", @model.value.gsub(/\s/, '').length]
          when :words then ["#{TOTAL_PREFIX}total words", @model.value.split.size]
          when :lines then ["#{TOTAL_PREFIX}total lines", 1]
          end
        emit(label => amount) if label
      end
    end
  end

  # controller: sums values, routing by whether the key is a total
  class WordCountReducer < HadoopDsl::BaseReducer
    def initialize(script, key, values)
      super(script, WordCountReducerModel.new(key, values))
    end

    # model methods
    def_delegators :@model, *MODEL_METHODS

    # emitters: each DSL keyword aggregates only the keys it is responsible for
    def count_uniq
      aggregate unless @model.total_value?
    end

    def total(*types)
      aggregate if @model.total_value?
    end
  end

  # models
  class WordCountMapperModel < HadoopDsl::BaseMapperModel
  end

  class WordCountReducerModel < HadoopDsl::BaseReducerModel
    # true when the key carries a total (tab-prefixed)
    def total_value?
      @key =~ /^#{TOTAL_PREFIX}/
    end
  end
end
@@ -0,0 +1,27 @@
1
+ require File.join(File.dirname(__FILE__), 'spec_helper')
2
+ require 'hadoop_dsl_client'
3
+
4
describe HadoopDsl::Client do
  before do
    @client = HadoopDsl::Client.new(["examples/wordcount.rb", "in", "out"])
  end

  it 'can parse args' do
    joined = @client.files.join
    joined.should match(/ruby_wrapper\.rb/)
    joined.should match(/dsl_init\.rb/)
    @client.files.should include('examples/wordcount.rb')
    @client.inputs.should == 'in'
    @client.outputs.should == 'out'
  end

  it 'can add dsl file into mapred args' do
    expected = "--script dsl_init.rb in out --dslfile wordcount.rb"
    @client.mapred_args.should == expected
  end

  it 'can add dsl lib files' do
    lib_path = HadoopDsl.lib_path
    %w[core.rb log_analysis.rb].each do |lib|
      @client.files.should include(File.join(lib_path, lib))
    end
  end
end
@@ -0,0 +1,73 @@
1
+ require File.join(File.dirname(__FILE__), 'spec_helper')
2
+ require 'core'
3
+
4
include HadoopDsl

describe 'BaseMapRed' do
  before(:all) do
    # minimal DSL script: only input/output paths
    @script = create_tmp_script(<<-EOF)
from 'test/inputs'
to 'test/outputs'
    EOF
  end

  it 'emit key value' do
    mapper = BaseMapper.new(@script, BaseMapperModel.new(nil, nil))
    mapper.emit('key' => 'value')
    mapper.emitted.should == [{'key' => 'value'}]
  end

  it 'can run BaseMapper in minimum' do
    BaseMapper.new(@script, BaseMapperModel.new('key', 'value')).run
  end

  it 'can run BaseReducer in minimum' do
    BaseReducer.new(@script, BaseReducerModel.new('key', 'values')).run
  end

  it 'can run BaseSetup in minimum' do
    BaseSetup.new(@script, nil).run
  end

  describe BaseMapper do
    it 'can emit as identity' do
      mapper = BaseMapper.new(@script, BaseMapperModel.new('key', 'value'))
      mapper.identity

      mapper.emitted.should == [{'key' => 'value'}]
    end
  end

  describe BaseReducer do
    it 'can emit as aggregate' do
      reducer = BaseReducer.new(@script, BaseReducerModel.new('key', [1, 2, 3]))
      reducer.aggregate

      reducer.emitted.should == [{'key' => 6}]
    end

    it 'can emit as identity' do
      reducer = BaseReducer.new(@script, BaseReducerModel.new('key', [1, 2, 3]))
      reducer.identity

      reducer.emitted.should == [{'key' => 1}, {'key' => 2}, {'key' => 3}]
    end
  end

  describe BaseSetup do
    it 'can get paths' do
      setup = BaseSetup.new(@script, nil)
      setup.run
      setup.paths[0].should == 'test/inputs'
      setup.paths[1].should == 'test/outputs'
    end
  end
end
@@ -0,0 +1,49 @@
1
+ require 'dsl_init'
2
+
3
describe 'mapreduce init' do

  before(:all) do
    # LogAnalysis script: count unique values of column c1 under topic t1
    @script = create_tmp_script(<<-EOF)
dsl 'LogAnalysis'
data 'test' do
  from 'test/inputs'
  to 'test/outputs'

  separate(" ")
  column_name 'c0', 'c1', 'c2', 'c3'
  topic 't1' do
    count_uniq columns(:c1)
  end
end
    EOF
  end

  before do
    @one = 1
    @output = mock('output')
  end

  it 'can map successfully' do
    key = 'key'
    value = 'it should be fine'
    # one unique c1 value in the line => exactly one collect call
    @output.should_receive(:collect).once

    map(key, value, @output, nil, @script)
  end

  it 'can reduce successfully' do
    key = "t1\tkey"
    values = [@one, @one, @one]
    # values collapse to a single aggregated pair
    @output.should_receive(:collect).once

    reduce(key, values, @output, nil, @script)
  end

  it 'can set job conf' do
    conf = mock('jobconf')
    paths = setup(conf, @script)

    paths[0].should == 'test/inputs'
    paths[1].should == 'test/outputs'
  end
end
@@ -0,0 +1,84 @@
1
+ require 'log_analysis'
2
+ require 'word_count'
3
+ require 'hive_like'
4
+
5
include HadoopDsl::LogAnalysis
describe 'Apache Log Example' do
  before(:all) do
    @script = File.join(File.dirname(__FILE__), '..', 'examples', 'log_analysis_test.rb')
    @bot_ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
    # Apache combined-format access log line carrying the bot user agent
    @value = %Q!127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326 "-" "#{@bot_ua}"!
  end

  it 'can run example by mapper' do
    mapper = LogAnalysisMapper.new(@script, nil, @value)
    mapper.run
    mapper.emitted.first.should == {"ua\t#{@bot_ua}" => 1}
  end

  it 'can run example by reducer' do
    reducer = LogAnalysisReducer.new(@script, "ua\tChrome", [1, 1, 1])
    reducer.run
    reducer.emitted.first["ua\tChrome"].should == 3
  end
end

include HadoopDsl::WordCount
describe 'Word Count Example' do
  before(:all) do
    @script = File.join(File.dirname(__FILE__), '..', 'examples', 'word_count_test.rb')
    @value = 'Lorem ipsum ipsum Lorem sit amet,'
  end

  it 'can run example by mapper' do
    mapper = WordCountMapper.new(@script, nil, @value)
    mapper.run
    # 6 word-count pairs + 3 totals
    mapper.emitted.size.should == 9
    mapper.emitted.each do |e|
      case e.keys.first
      when 'Lorem'
        e.values.first.should == 1
      when 'total words'
        e.values.first.should == 6
      end
    end
  end

  it 'can run example by reducer' do
    reducer = WordCountReducer.new(@script, "Lorem", [1, 1, 1])
    reducer.run
    reducer.emitted.first["Lorem"].should == 3
  end
end

include HadoopDsl::HiveLike
describe 'Hive Like Example' do
  before(:all) do
    @script = File.join(File.dirname(__FILE__), '..', 'examples', 'hive_like_test.rb')
    @value = 'apple, 3, 100'
  end

  it 'can run setup' do
    conf = mock('conf')
    conf.should_receive(:output_key_class=).once
    conf.should_receive(:output_value_class=).once

    setup = HiveLikeSetup.new(@script, conf)
    setup.run
    setup.paths[0].should == 'hive-like/items.txt'
  end

  it 'can run example by mapper' do
    mapper = HiveLikeMapper.new(@script, nil, @value)
    mapper.run
    mapper.emitted.size.should == 1
    mapper.emitted.first['items'].should == '3, 100, apple'
  end

  it 'can run example by reducer' do
    values = ['v1', 'v2', 'v3']
    reducer = HiveLikeReducer.new(@script, "items", values)
    reducer.run
    # identity select keeps only the first value
    reducer.emitted.first["items"].should == 'v1'
  end
end
@@ -0,0 +1,57 @@
1
+ require File.join(File.dirname(__FILE__), 'spec_helper')
2
+ require 'hive_like'
3
+
4
include HadoopDsl::HiveLike

describe HiveLikeSetup do
  it 'should load data' do
    script = create_tmp_script(%Q!load_data "hive-like/inputs", items;!)
    conf = mock('conf')
    conf.should_receive(:output_key_class=).once
    conf.should_receive(:output_value_class=).once

    setup = HiveLikeSetup.new(script, conf)
    setup.run
    setup.paths[0].should == 'hive-like/inputs'
    setup.paths[1].should == 'hive-like/outputs'
  end
end

describe HiveLikeMapper do
  before do
    @value = 'apple, 3, 100'
  end

  # builds a mapper whose table has columns item STRING, quantity INT, price INT
  def new_mapper_with_table
    mapper = HiveLikeMapper.new(nil, nil, @value)
    mapper.create_table('items', 'item', 'STRING', 'quantity', 'INT', 'price', 'INT')
    mapper
  end

  it 'should create table' do
    mapper = new_mapper_with_table
    mapper.table.name.should == 'items'
    mapper.table.column(0).should == 'item'
    mapper.table.column(1).should == 'quantity'
  end

  it 'should select' do
    mapper = new_mapper_with_table
    mapper.select("item", "quantity", "price", "from", "items")
    mapper.emitted.first.should == {'items' => 'apple, 3, 100'}
  end

  it 'should pre process script body' do
    mapper = HiveLikeMapper.new(nil, nil, @value)
    processed = mapper.pre_process("select foo, bar from table;\n")
    processed.should == %Q!select("foo", "bar", "from", "table")\n!
  end
end

describe HiveLikeReducer do
  it 'should select as identity' do
    reducer = HiveLikeReducer.new(nil, 'Lorem', [1, 1, 1])

    reducer.select
    reducer.emitted[0].should == {'Lorem' => 1}
  end
end