hadoop-rubydsl 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/lib/hive_like.rb ADDED
@@ -0,0 +1,122 @@
1
+ require 'core'
2
+ require 'enumerator'
3
+
4
# Hive-like SQL DSL: scripts express simple CREATE TABLE / SELECT
# statements which are rewritten into Ruby method calls (see
# HiveLikeMapRed#pre_process) and executed as Hadoop map/reduce jobs.
module HadoopDsl::HiveLike
  include HadoopDsl

  # Model methods exposed on mappers/reducers via def_delegators.
  AVAILABLE_METHODS = [:select, :create_table, :table]

  # common -- shared pre-processing that turns SQL-ish statements
  # ("select foo from bar;") into Ruby method calls.
  module HiveLikeMapRed
    # Rewrites every "method args;" statement line of the script body
    # into a Ruby call: %Q!method("arg1", "arg2", ...)!.
    # Comment lines (starting with '#') are dropped; all other lines
    # pass through unchanged.
    def pre_process(body)
      processed = ""
      # String#each was removed in Ruby 1.9; each_line behaves the same
      # on 1.8 and on modern rubies.
      body.each_line do |line|
        line = line.chomp # avoid doubling newlines on pass-through lines
        next if line =~ /^#/
        if line =~ /^(\w*)\s+(.*);$/
          method = $1
          args = sprit_and_marge_args($2)
          processed << "#{method}(#{args})\n"
        else
          processed << line + "\n"
        end
      end
      processed
    end

    # Splits a raw SQL-ish argument list (parentheses and commas
    # tolerated) and re-emits every token as a double-quoted,
    # comma-separated Ruby argument string.
    # NOTE: the name is a long-standing typo of "split_and_merge_args";
    # it is kept for backward compatibility and aliased below.
    def sprit_and_marge_args(raw)
      raw.gsub(/[\(\)]/, ' ').split.map do |s|
        stripped = s.gsub(/[\s,"']/, '')
        %Q!"#{stripped}"!
      end.join(", ")
    end
    alias_method :split_and_merge_args, :sprit_and_marge_args
  end

  # controller -- job setup: resolves input/output paths and formats.
  class HiveLikeSetup < BaseSetup
    # load_data "path", table;
    # Sets the job input to +inputs+ and derives the output directory by
    # replacing the path's basename with 'outputs'.
    # +table+ is accepted for SQL-likeness but is currently unused.
    def load_data(inputs, table)
      @from = inputs
      @to = inputs.gsub(/#{File.basename(inputs)}$/, 'outputs')
    end

    def output_format
      @conf.output_key_class = Text
      @conf.output_value_class = Text
    end

    # No-op: the setup phase must still respond to select so scripts
    # containing a select statement evaluate without NoMethodError.
    def select(*args) end

    include HiveLikeMapRed
  end

  class HiveLikeMapper < BaseMapper
    def initialize(script, key, value)
      super(script, HiveLikeMapperModel.new(key, value))
    end

    include HiveLikeMapRed

    # model methods
    def_delegators :@model, *AVAILABLE_METHODS
  end

  class HiveLikeReducer < BaseReducer
    def initialize(script, key, values)
      super(script, HiveLikeReducerModel.new(key, values))
    end

    include HiveLikeMapRed

    # model methods
    def_delegators :@model, *AVAILABLE_METHODS
  end

  # model
  class HiveLikeMapperModel < BaseMapperModel
    attr_reader :table

    # emitters

    # create_table name, col1, TYPE1, col2, TYPE2, ...;
    # Registers the table schema. Only the column names (even positions)
    # are kept; the SQL type tokens are ignored.
    def create_table(name, *column_and_type)
      @table = Table.new(name)
      column_and_type.each_slice(2) do |column, _type|
        @table.columns << column
      end
    end

    # select col1, ..., from, table;
    # Looks up each selected column's position in the registered schema,
    # picks those fields out of the current input record and emits them
    # joined by ", " under the name following 'from'.
    def select(*args)
      from_index = args.index('from')
      if from_index
        fields = @value.split(/[,\s]+/) # split the record once, not per column
        values = args[0...from_index].map do |column|
          fields[@table.columns.index(column)]
        end
        @controller.emit(args[from_index + 1] => values.join(", "))
      end
    end

    # Minimal table schema: a name plus ordered column names.
    class Table
      attr_reader :name, :columns

      def initialize(name)
        @name = name
        @columns = []
      end

      def column(index) @columns[index] end
    end
  end

  class HiveLikeReducerModel < BaseReducerModel
    # emitters

    # Reducer-side select is the identity: emits each value unchanged.
    def select(*args) identity end
  end
end
data/lib/init.rb ADDED
@@ -0,0 +1,60 @@
1
require 'core'
require 'java'
require 'mapred_factory'

import 'org.apache.hadoop.io.IntWritable'
import 'org.apache.hadoop.io.Text'

include HadoopDsl

# Hadoop IO types, aliased into the DSL namespace
HadoopDsl::Text = Text
HadoopDsl::IntWritable = IntWritable

# Called from the Java side once per input record: builds the
# DSL-specific mapper for the script, runs it and collects its output.
def map(key, value, output, reporter, script)
  mapper = MapperFactory.create(script, key.to_string, value.to_string)
  mapper.run
  write(output, mapper)
end

# Called from the Java side once per key group: converts the Hadoop
# writables to Ruby values, runs the DSL-specific reducer and collects
# its output.
def reduce(key, values, output, reporter, script)
  ruby_values = values.map {|v| to_ruby(v)}
  reducer = ReducerFactory.create(script, key.to_string, ruby_values)
  reducer.run
  write(output, reducer)
end

# Runs the setup phase and hands the resolved input/output paths back
# to the Java side as a Java array.
def setup(conf, script)
  setup = SetupFactory.create(script, conf)
  setup.run
  setup.paths.to_java
end

private

# Pushes every {key => value} pair emitted by a mapper/reducer into the
# Hadoop output collector, converting both sides to writables.
def write(output, controller)
  controller.emitted.each do |pair|
    pair.each do |out_key, out_value|
      output.collect(to_hadoop(out_key), to_hadoop(out_value))
    end
  end
end

# Hadoop writable -> plain Ruby object.
def to_ruby(value)
  case value
  when IntWritable
    value.get
  when Text
    value.to_string
  else
    raise "no match class: #{value.class}"
  end
end

# Plain Ruby object -> Hadoop writable.
def to_hadoop(value)
  case value
  when Integer
    IntWritable.new(value)
  when String
    text = Text.new
    text.set(value)
    text
  else
    raise "no match class: #{value.class}"
  end
end
@@ -0,0 +1 @@
1
+ jruby-complete-*.jar
Binary file
@@ -0,0 +1,165 @@
1
+ require 'core'
2
+ require 'enumerator'
3
+
4
# Log-analysis DSL: scripts declare how each log line is split into
# columns and which "topics" (grouped counters / sums) to emit.
module HadoopDsl::LogAnalysis
  include HadoopDsl

  KEY_SEP = "\t"
  PREFIX = 'col'
  PASS = nil
  # Model methods exposed on mappers/reducers via def_delegators.
  AVAILABLE_METHODS = [:separate, :pattern, :column_name, :column, :topic, :value, :count_uniq, :sum]

  # common
  module LogAnalysisMapRed
    # entry point -- the DSL's top-level block; simply evaluates its body.
    def data(description = '', &block) yield end

    def each_line(&block) yield end
  end

  # controller
  class LogAnalysisSetup < BaseSetup
    include LogAnalysisMapRed
  end

  class LogAnalysisMapper < BaseMapper
    def initialize(script, key, value)
      super(script, LogAnalysisMapperModel.new(key, value))
    end

    include LogAnalysisMapRed

    # model methods
    def_delegators :@model, *AVAILABLE_METHODS
  end

  class LogAnalysisReducer < BaseReducer
    def initialize(script, key, values)
      super(script, LogAnalysisReducerModel.new(key, values))
    end

    include LogAnalysisMapRed

    # model methods
    def_delegators :@model, *AVAILABLE_METHODS
  end

  # model
  class LogAnalysisMapperModel < BaseMapperModel
    def initialize(key, value)
      super(key, value)
      @columns = ColumnArray.new
      @topics = []
    end

    # Accessor for the parsed columns (indexable by position or name).
    def column; @columns end

    # Opens a topic scope: emitters invoked inside the block are keyed
    # by this topic's label.
    def topic(desc, options = {}, &block)
      @topics << @current_topic = Topic.new(desc, options[:label])
      yield if block_given?
      @current_topic
    end

    # Splits the raw input line on +sep+ into positional columns.
    def separate(sep)
      parts = @value.split(sep)
      create_or_replace_columns_with(parts) {|column, value| column.value = value}
    end

    # Matches the raw line against +re+; each capture group becomes a column.
    def pattern(re)
      if @value =~ re
        md = Regexp.last_match
        create_or_replace_columns_with(md.captures) {|column, value| column.value = value}
      end
    end

    # column names by String converted to Symbol
    def column_name(*names)
      sym_names = names.map {|name| name.is_a?(String) ? name.to_sym : name }
      create_or_replace_columns_with(sym_names) {|column, name| column.name = name}
    end

    # Yields (existing-or-new column, element) for every element of
    # +array+ and replaces @columns with the result, so values and names
    # set by earlier calls survive.
    def create_or_replace_columns_with(array, &block)
      # enum_for keeps 1.8.6 compatibility (each_with_index without a
      # block only returns an Enumerator from 1.8.7 on).
      columns = array.enum_for(:each_with_index).map do |p, i|
        c = @columns[i] ? @columns[i] : Column.new(i)
        yield c, p
        c
      end
      @columns = ColumnArray.new(columns)
    end

    # emitters

    # Emits ("<topic label>\t<column value>" => 1) for uniq counting.
    def count_uniq(column)
      @controller.emit([@current_topic.label, KEY_SEP, column.value].join => 1)
    end

    # Emits (<topic label> => integer column value) for summing.
    def sum(column)
      @controller.emit([@current_topic.label].join => column.value.to_i)
    end

    # Array of columns, indexable by Integer position or by Symbol /
    # String column name.
    class ColumnArray < Array
      def [](key)
        case key
        when Integer then at(key)
        when Symbol then find {|c| c.name == key}
        when String then find {|c| c.name == key.to_sym}
        end
      end
    end

    class Column
      attr_reader :index
      attr_accessor :value, :name

      def initialize(index, value = nil)
        @index, @value = index, value
      end
    end

    class Topic
      def initialize(desc, label = nil)
        @desc, @label = desc, label
      end

      # Explicit label, or the description with whitespace underscored.
      def label
        @label || @desc.gsub(/\s/, '_')
      end
    end
  end

  class LogAnalysisReducerModel < BaseReducerModel
    def initialize(key, values)
      super(key, values)
      # Mapper keys look like "<label>\t<value>"; recover the topic label.
      if key =~ /(\w*)#{KEY_SEP}?(.*)/
        @topic = Topic.new($1, values)
      end
    end

    def topic(desc, options = {}, &block)
      @current_topic = Topic.new(options[:label] || desc.gsub(/\s/, '_'), nil)
      yield if block_given?
      @current_topic
    end

    # Aggregates only when the incoming key belongs to the topic the
    # script is currently declaring (topics compare by label).
    def count_uniq(column)
      aggregate if @topic == @current_topic
    end

    def sum(column)
      aggregate if @topic == @current_topic
    end

    class Topic
      attr_reader :label, :values

      def initialize(label, values)
        @label, @values = label, values
      end

      # Topics are equal when their labels match.
      def ==(rh) self.label == rh.label end
    end
  end
end
@@ -0,0 +1,43 @@
1
+ require 'util'
2
+
3
module HadoopDsl
  # Instantiates the DSL-specific mapper/reducer/setup class declared by
  # a script's "use 'DslName'" line.
  class MapRedFactory
    # Scans the script for a line of the form: use 'LogAnalysis'
    # and returns the DSL name ('LogAnalysis'), or nil when no such line
    # exists.
    # BUGFIX: the previous version returned inside the first each_line
    # iteration, so any script whose *first* line was not "use '...'"
    # yielded nil.
    def self.dsl_name(script)
      read_file(script).each_line do |line|
        return $1 if line =~ /^use\s*'(\w*)'/
      end
      nil
    end

    def self.require_dsl_lib(dsl_name)
      require snake_case(dsl_name)
    end

    # Resolves HadoopDsl::<DslName>::<DslName><Suffix> via const_get
    # instead of eval (no string evaluation of script-derived names).
    def self.dsl_class(dsl_name, suffix)
      HadoopDsl.const_get(dsl_name).const_get("#{dsl_name}#{suffix}")
    end
  end

  class MapperFactory < MapRedFactory
    def self.create(script, key, value)
      dsl_name = self.dsl_name(script)
      require_dsl_lib(dsl_name)
      dsl_class(dsl_name, 'Mapper').new(script, key, value)
    end
  end

  class ReducerFactory < MapRedFactory
    def self.create(script, key, values)
      dsl_name = self.dsl_name(script)
      require_dsl_lib(dsl_name)
      dsl_class(dsl_name, 'Reducer').new(script, key, values)
    end
  end

  class SetupFactory < MapRedFactory
    def self.create(script, conf)
      dsl_name = self.dsl_name(script)
      require_dsl_lib(dsl_name)
      begin
        dsl_class(dsl_name, 'Setup').new(script, conf)
      rescue StandardError
        # DSLs without a dedicated Setup class fall back to the default.
        HadoopDsl::BaseSetup.new(script, conf)
      end
    end
  end
end
data/lib/util.rb ADDED
@@ -0,0 +1,11 @@
1
+ # utility functions
2
+
3
module HadoopDsl
  # Converts CamelCase to snake_case, e.g. 'LogAnalysis' -> 'log_analysis'.
  def snake_case(str)
    str.gsub(/\B[A-Z]/, '_\&').downcase
  end

  # Returns the whole content of +file_name+.
  # BUGFIX: File.read closes the handle; the previous
  # File.open(file_name).read leaked an open file descriptor.
  def read_file(file_name)
    File.read(file_name)
  end
end
data/lib/word_count.rb ADDED
@@ -0,0 +1,76 @@
1
+ require 'core'
2
+ require 'enumerator'
3
+
4
# Word-count DSL: counts per-word occurrences plus simple totals
# (bytes/words/lines) over the input.
module HadoopDsl::WordCount
  include HadoopDsl

  # Model methods exposed on mappers/reducers via def_delegators.
  AVAILABLE_METHODS = [:count_uniq, :total]
  # Keys of total_* aggregates carry a leading tab so the reducer can
  # tell them apart from ordinary word keys.
  TOTAL_PREFIX = "\t"

  # common
  module WordCountMapRed
    # entry point -- the DSL's top-level block; simply evaluates its body.
    def data(description = '', &block) yield end
  end

  # controller
  class WordCountMapper < BaseMapper
    def initialize(script, key, value)
      super(script, WordCountMapperModel.new(key, value))
    end

    include WordCountMapRed

    # model methods
    def_delegators :@model, *AVAILABLE_METHODS
  end

  class WordCountReducer < BaseReducer
    def initialize(script, key, values)
      super(script, WordCountReducerModel.new(key, values))
    end

    include WordCountMapRed

    # model methods
    def_delegators :@model, *AVAILABLE_METHODS
  end

  # model
  class WordCountMapperModel < BaseMapperModel
    # emitters

    # Emits (word => 1) for every whitespace-separated token of the line.
    def count_uniq
      @value.split.each {|word| @controller.emit(word => 1)}
    end

    # Emits per-line totals; +types+ may include :bytes, :words, :lines.
    def total(*types)
      types.each do |type|
        case type
        when :bytes
          @controller.emit("#{TOTAL_PREFIX}total bytes" => @value.gsub(/\s/, '').length)
        when :words
          @controller.emit("#{TOTAL_PREFIX}total words" => @value.split.size)
        when :lines
          @controller.emit("#{TOTAL_PREFIX}total lines" => 1)
        end
      end
    end
  end

  class WordCountReducerModel < BaseReducerModel
    # emitters

    # Word keys are summed by count_uniq, total keys by total; each
    # aggregates only the kind of key it owns.
    def count_uniq; aggregate unless total_value? end
    def total(*types); aggregate if total_value? end

    private
    # True when the key carries a total_* aggregate (tab prefix).
    def total_value?; @key =~ /^#{TOTAL_PREFIX}/ end
  end
end
data/spec/core_spec.rb ADDED
@@ -0,0 +1,73 @@
1
require 'init'
require 'core'

include HadoopDsl

describe 'BaseMapRed' do
  # Minimal DSL script shared by every example: input and output paths only.
  before(:all) do
    @script = create_tmp_script(<<-EOF)
      from 'test/inputs'
      to 'test/outputs'
    EOF
  end

  it 'emit key value' do
    mapper = BaseMapper.new(@script, BaseMapperModel.new(nil, nil))
    mapper.emit('key' => 'value')
    mapper.emitted.should == [{'key' => 'value'}]
  end

  it 'can run BaseMapper in minimum' do
    BaseMapper.new(@script, BaseMapperModel.new('key', 'value')).run
  end

  it 'can run BaseReducer in minimum' do
    BaseReducer.new(@script, BaseReducerModel.new('key', 'values')).run
  end

  it 'can run BaseSetup in minimum' do
    BaseSetup.new(@script, nil).run
  end

  describe BaseMapper do
    it 'can emit as identity' do
      mapper_model = BaseMapperModel.new('key', 'value')
      mapper = BaseMapper.new(@script, mapper_model)
      mapper_model.identity

      mapper.emitted.should == [{'key' => 'value'}]
    end
  end

  describe BaseReducer do
    it 'can emit as aggregate' do
      reducer_model = BaseReducerModel.new('key', [1, 2, 3])
      reducer = BaseReducer.new(@script, reducer_model)
      reducer_model.aggregate

      reducer.emitted.should == [{'key' => 6}]
    end

    it 'can emit as identity' do
      reducer_model = BaseReducerModel.new('key', [1, 2, 3])
      reducer = BaseReducer.new(@script, reducer_model)
      reducer_model.identity

      reducer.emitted.should == [{'key' => 1}, {'key' => 2}, {'key' => 3}]
    end
  end

  describe BaseSetup do
    it 'can get paths' do
      setup = BaseSetup.new(@script, nil)
      setup.run
      setup.paths[0].should == 'test/inputs'
      setup.paths[1].should == 'test/outputs'
    end
  end
end
@@ -0,0 +1,82 @@
1
require 'log_analysis'
require 'word_count'

include HadoopDsl::LogAnalysis
# Runs the shipped example scripts end-to-end through each DSL's
# mapper and reducer. (Fixed typo in the group name: 'Aapach' -> 'Apache'.)
describe 'Apache Log Example' do
  before(:all) do
    @script = File.join(File.dirname(__FILE__), '..', 'examples', 'apachelog-v2.rb')
    # Common Log Format sample line
    @value = '127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326'
  end

  it 'can run example by mapper' do
    mapper = LogAnalysisMapper.new(@script, nil, @value)
    mapper.run
    mapper.emitted.first["user\tfrank"].should == 1
  end

  it 'can run example by reducer' do
    reducer = LogAnalysisReducer.new(@script, "user\tfrank", [1, 1, 1])
    reducer.run
    reducer.emitted.first["user\tfrank"].should == 3
  end
end

include HadoopDsl::WordCount
describe 'Word Count Example' do
  before(:all) do
    @script = File.join(File.dirname(__FILE__), '..', 'examples', 'word_count_test.rb')
    @value = 'Lorem ipsum ipsum Lorem sit amet,'
  end

  it 'can run example by mapper' do
    mapper = WordCountMapper.new(@script, nil, @value)
    mapper.run
    mapper.emitted.size.should == 9
    mapper.emitted.each do |e|
      case e.keys.first
      when 'Lorem'
        e.values.first.should == 1
      when 'total words'
        e.values.first.should == 6
      end
    end
  end

  it 'can run example by reducer' do
    reducer = WordCountReducer.new(@script, "Lorem", [1, 1, 1])
    reducer.run
    reducer.emitted.first["Lorem"].should == 3
  end
end

include HadoopDsl::HiveLike
describe 'Hive Like Example' do
  before(:all) do
    @script = File.join(File.dirname(__FILE__), '..', 'examples', 'hive_like_test.rb')
    @value = 'apple, 3, 100'
  end

  it 'can run setup' do
    conf = mock('conf')
    conf.should_receive(:output_key_class=).once
    conf.should_receive(:output_value_class=).once

    setup = HiveLikeSetup.new(@script, conf)
    setup.run
    setup.paths[0].should == 'hive-like/items.txt'
  end

  it 'can run example by mapper' do
    mapper = HiveLikeMapper.new(@script, nil, @value)
    mapper.run
    mapper.emitted.size.should == 1
    mapper.emitted.first['items'].should == '3, 100, apple'
  end

  it 'can run example by reducer' do
    reducer = HiveLikeReducer.new(@script, "items", ['v1', 'v2', 'v3'])
    reducer.run
    reducer.emitted.first["items"].should == 'v1'
  end
end
@@ -0,0 +1,58 @@
1
require 'init'
require 'core'
require 'hive_like'

include HadoopDsl::HiveLike

describe HiveLikeSetup do
  it 'should load data' do
    script = create_tmp_script(%Q!load_data "hive-like/inputs", items;!)
    conf = mock('conf')
    conf.should_receive(:output_key_class=).once
    conf.should_receive(:output_value_class=).once

    setup = HiveLikeSetup.new(script, conf)
    setup.run
    setup.paths[0].should == 'hive-like/inputs'
    setup.paths[1].should == 'hive-like/outputs'
  end
end

describe HiveLikeMapper do
  # One CSV-ish input record shared by the examples below.
  before do
    @value = 'apple, 3, 100'
  end

  it 'should create table' do
    mapper = HiveLikeMapper.new(nil, nil, @value)
    mapper.create_table('items', 'item', 'STRING', 'quantity', 'INT', 'price', 'INT')
    mapper.table.name.should == 'items'
    mapper.table.column(0).should == 'item'
    mapper.table.column(1).should == 'quantity'
  end

  it 'should select' do
    mapper = HiveLikeMapper.new(nil, nil, @value)
    mapper.create_table('items', 'item', 'STRING', 'quantity', 'INT', 'price', 'INT')
    mapper.select("item", "quantity", "price", "from", "items")
    mapper.emitted.first.should == {'items' => 'apple, 3, 100'}
  end

  it 'should pre process script body' do
    mapper = HiveLikeMapper.new(nil, nil, @value)
    processed = mapper.pre_process("select foo, bar from table;\n")
    processed.should == %Q!select("foo", "bar", "from", "table")\n!
  end
end

describe HiveLikeReducer do
  it 'should select as identity' do
    reducer = HiveLikeReducer.new(nil, 'Lorem', [1, 1, 1])

    reducer.select
    reducer.emitted[0].should == {'Lorem' => 1}
  end
end