hadoop-rubydsl 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/hive_like.rb ADDED
@@ -0,0 +1,122 @@
+ require 'core'
+ require 'enumerator'
+
+ module HadoopDsl::HiveLike
+   include HadoopDsl
+
+   AVAILABLE_METHODS = [:select, :create_table, :table]
+
+   # common
+   module HiveLikeMapRed
+     def pre_process(body)
+       processed = ""
+       body.each_line do |line|
+         next if line =~ /^#/
+         if line =~ /^(\w*)\s+(.*);$/
+           method = $1
+           args = split_and_merge_args($2)
+           processed << "#{method}(#{args})\n"
+         else
+           processed << line + "\n"
+         end
+       end
+       processed
+     end
+
+     def split_and_merge_args(raw)
+       raw.gsub(/[\(\)]/, ' ').split.map do |s|
+         stripped = s.gsub(/[\s,"']/, '')
+         %Q!"#{stripped}"!
+       end.join(", ")
+     end
+   end
+
+   # controller
+   class HiveLikeSetup < BaseSetup
+     def load_data(inputs, table)
+       @from = inputs
+       @to = inputs.gsub(/#{File.basename(inputs)}$/, 'outputs')
+     end
+
+     def output_format
+       @conf.output_key_class = Text
+       @conf.output_value_class = Text
+     end
+
+     # not strictly needed, but an error occurs if it is missing
+     def select(*args) end
+
+     include HiveLikeMapRed
+   end
+
+   class HiveLikeMapper < BaseMapper
+     def initialize(script, key, value)
+       super(script, HiveLikeMapperModel.new(key, value))
+     end
+
+     include HiveLikeMapRed
+
+     # model methods
+     def_delegators :@model, *AVAILABLE_METHODS
+   end
+
+   class HiveLikeReducer < BaseReducer
+     def initialize(script, key, values)
+       super(script, HiveLikeReducerModel.new(key, values))
+     end
+
+     include HiveLikeMapRed
+
+     # model methods
+     def_delegators :@model, *AVAILABLE_METHODS
+   end
+
+   # model
+   class HiveLikeMapperModel < BaseMapperModel
+     attr_reader :table
+
+     def initialize(key, value)
+       super(key, value)
+     end
+
+     # emitters
+     def create_table(name, *column_and_type)
+       @table = Table.new(name)
+       column_and_type.each_with_index do |column, index|
+         next if index % 2 != 0 # odd indexes are the column types
+         @table.columns << column
+       end
+     end
+
+     def select(*args)
+       from_index = args.index('from')
+       if from_index
+         splitted = @value.split(/[,\s]+/)
+         values = args[0...from_index].map do |column|
+           splitted[@table.columns.index(column)]
+         end
+         @controller.emit(args[from_index + 1] => values.join(", "))
+       end
+     end
+
+     class Table
+       attr_reader :name, :columns
+
+       def initialize(name)
+         @name = name
+         @columns = []
+       end
+
+       def column(index) @columns[index] end
+     end
+   end
+
+   class HiveLikeReducerModel < BaseReducerModel
+     def initialize(key, values)
+       super(key, values)
+     end
+
+     # emitters
+     def select(*args) identity end
+   end
+ end
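
For orientation: pre_process rewrites a hive-like script's semicolon-terminated statements into plain Ruby method calls before the DSL body is evaluated. A minimal sketch, with statements modeled on the bundled hive_like_test example (the table layout here is illustrative):

    # hive-like input
    create_table items (item STRING, quantity INT, price INT);
    select item, quantity, price from items;

    # after pre_process, evaluated as Ruby
    create_table("items", "item", "STRING", "quantity", "INT", "price", "INT")
    select("item", "quantity", "price", "from", "items")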
data/lib/init.rb ADDED
@@ -0,0 +1,60 @@
+ require 'core'
+ require 'java'
+ require 'mapred_factory'
+
+ import 'org.apache.hadoop.io.IntWritable'
+ import 'org.apache.hadoop.io.Text'
+
+ include HadoopDsl
+
+ # Hadoop IO types
+ HadoopDsl::Text = Text
+ HadoopDsl::IntWritable = IntWritable
+
+ def map(key, value, output, reporter, script)
+   mapper = MapperFactory.create(script, key.to_string, value.to_string)
+   mapper.run
+
+   write(output, mapper)
+ end
+
+ def reduce(key, values, output, reporter, script)
+   ruby_values = values.map {|v| to_ruby(v)}
+   reducer = ReducerFactory.create(script, key.to_string, ruby_values)
+   reducer.run
+
+   write(output, reducer)
+ end
+
+ def setup(conf, script)
+   setup = SetupFactory.create(script, conf)
+   setup.run
+
+   setup.paths.to_java
+ end
+
+ private
+
+ def write(output, controller)
+   controller.emitted.each do |e|
+     e.each do |k, v|
+       output.collect(to_hadoop(k), to_hadoop(v))
+     end
+   end
+ end
+
+ def to_ruby(value)
+   case value
+   when IntWritable then value.get
+   when Text then value.to_string
+   else raise "no match class: #{value.class}"
+   end
+ end
+
+ def to_hadoop(value)
+   case value
+   when Integer then IntWritable.new(value)
+   when String then t = Text.new; t.set(value); t
+   else raise "no match class: #{value.class}"
+   end
+ end
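
to_ruby and to_hadoop are the deliberately narrow bridge between Hadoop's Writable types and plain Ruby values; anything outside IntWritable/Text and Integer/String raises. A minimal sketch of the round trip, runnable under JRuby with the Hadoop jars on the classpath:

    w = to_hadoop(42)        # => IntWritable wrapping 42
    to_ruby(w)               # => 42
    t = to_hadoop('word')    # => Text wrapping "word"
    to_ruby(t)               # => "word"
    to_hadoop(3.14)          # => RuntimeError: no match class: Float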
@@ -0,0 +1 @@
+ jruby-complete-*.jar
Binary file
data/lib/log_analysis.rb ADDED
@@ -0,0 +1,165 @@
+ require 'core'
+ require 'enumerator'
+
+ module HadoopDsl::LogAnalysis
+   include HadoopDsl
+
+   KEY_SEP = "\t"
+   PREFIX = 'col'
+   PASS = nil
+   AVAILABLE_METHODS = [:separate, :pattern, :column_name, :column, :topic, :value, :count_uniq, :sum]
+
+   # common
+   module LogAnalysisMapRed
+     # entry point
+     def data(description = '', &block) yield end
+
+     def each_line(&block) yield end
+   end
+
+   # controller
+   class LogAnalysisSetup < BaseSetup
+     def initialize(script, conf)
+       super(script, conf)
+     end
+
+     include LogAnalysisMapRed
+   end
+
+   class LogAnalysisMapper < BaseMapper
+     def initialize(script, key, value)
+       super(script, LogAnalysisMapperModel.new(key, value))
+     end
+
+     include LogAnalysisMapRed
+
+     # model methods
+     def_delegators :@model, *AVAILABLE_METHODS
+   end
+
+   class LogAnalysisReducer < BaseReducer
+     def initialize(script, key, values)
+       super(script, LogAnalysisReducerModel.new(key, values))
+     end
+
+     include LogAnalysisMapRed
+
+     # model methods
+     def_delegators :@model, *AVAILABLE_METHODS
+   end
+
+   # model
+   class LogAnalysisMapperModel < BaseMapperModel
+     def initialize(key, value)
+       super(key, value)
+       @columns = ColumnArray.new
+       @topics = []
+     end
+
+     def column; @columns end
+
+     def topic(desc, options = {}, &block)
+       @topics << @current_topic = Topic.new(desc, options[:label])
+       yield if block_given?
+       @current_topic
+     end
+
+     def separate(sep)
+       parts = @value.split(sep)
+       create_or_replace_columns_with(parts) {|column, value| column.value = value}
+     end
+
+     def pattern(re)
+       if @value =~ re
+         md = Regexp.last_match
+         create_or_replace_columns_with(md.captures) {|column, value| column.value = value}
+       end
+     end
+
+     # column names given as Strings are converted to Symbols
+     def column_name(*names)
+       sym_names = names.map {|name| name.is_a?(String) ? name.to_sym : name }
+       create_or_replace_columns_with(sym_names) {|column, name| column.name = name}
+     end
+
+     def create_or_replace_columns_with(array, &block)
+       columns = array.enum_for(:each_with_index).map do |p, i|
+         c = @columns[i] ? @columns[i] : Column.new(i)
+         yield c, p
+         c
+       end
+       @columns = ColumnArray.new(columns)
+     end
+
+     # emitters
+     def count_uniq(column)
+       @controller.emit([@current_topic.label, KEY_SEP, column.value].join => 1)
+     end
+
+     def sum(column)
+       @controller.emit([@current_topic.label].join => column.value.to_i)
+     end
+
+     class ColumnArray < Array
+       def [](key)
+         case key
+         when Integer then at(key)
+         when Symbol then (select {|c| c.name == key}).first
+         when String then (select {|c| c.name == key.to_sym}).first
+         end
+       end
+     end
+
+     class Column
+       attr_reader :index
+       attr_accessor :value, :name
+
+       def initialize(index, value = nil)
+         @index, @value = index, value
+       end
+     end
+
+     class Topic
+       def initialize(desc, label = nil)
+         @desc, @label = desc, label
+       end
+
+       def label
+         @label || @desc.gsub(/\s/, '_')
+       end
+     end
+   end
+
+   class LogAnalysisReducerModel < BaseReducerModel
+     def initialize(key, values)
+       super(key, values)
+       if key =~ /(\w*)#{KEY_SEP}?(.*)/
+         @topic = Topic.new($1, values)
+       end
+     end
+
+     def topic(desc, options = {}, &block)
+       @current_topic = Topic.new(options[:label] || desc.gsub(/\s/, '_'), nil)
+       yield if block_given?
+       @current_topic
+     end
+
+     def count_uniq(column)
+       aggregate if @topic == @current_topic
+     end
+
+     def sum(column)
+       aggregate if @topic == @current_topic
+     end
+
+     class Topic
+       attr_reader :label, :values
+
+       def initialize(label, values)
+         @label, @values = label, values
+       end
+
+       def ==(rh) self.label == rh.label end
+     end
+   end
+ end
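
A DSL script exercises these model methods through the delegators above. A sketch in the spirit of the bundled apachelog-v2 example (the regex, column names, and topic here are illustrative, not the actual example file):

    use 'LogAnalysis'

    data 'apache log' do
      each_line do
        pattern /^([\d.]+)\s+(\S+)\s+(\S+)\s+\[([^\]]+)\]/
        column_name 'host', 'identd', 'user', 'time'

        topic 'user count', :label => 'user' do
          count_uniq column['user']
        end
      end
    end

Against the log line used in the example spec, count_uniq emits {"user\tfrank" => 1}, which the reducer then sums per key.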
data/lib/mapred_factory.rb ADDED
@@ -0,0 +1,43 @@
+ require 'util'
+
+ module HadoopDsl
+   class MapRedFactory
+     def self.dsl_name(script)
+       read_file(script).each_line do |line|
+         return $1 if line =~ /^use\s*'(\w*)'/
+       end
+       nil
+     end
+
+     def self.require_dsl_lib(dsl_name)
+       require snake_case(dsl_name)
+     end
+   end
+
+   class MapperFactory < MapRedFactory
+     def self.create(script, key, value)
+       dsl_name = self.dsl_name(script)
+       require_dsl_lib(dsl_name)
+       mapper_class = "HadoopDsl::#{dsl_name}::#{dsl_name}Mapper"
+       return eval(mapper_class).new(script, key, value)
+     end
+   end
+
+   class ReducerFactory < MapRedFactory
+     def self.create(script, key, values)
+       dsl_name = self.dsl_name(script)
+       require_dsl_lib(dsl_name)
+       reducer_class = "HadoopDsl::#{dsl_name}::#{dsl_name}Reducer"
+       return eval(reducer_class).new(script, key, values)
+     end
+   end
+
+   class SetupFactory < MapRedFactory
+     def self.create(script, conf)
+       dsl_name = self.dsl_name(script)
+       require_dsl_lib(dsl_name)
+       setup_class = "HadoopDsl::#{dsl_name}::#{dsl_name}Setup"
+       eval(setup_class).new(script, conf) rescue HadoopDsl::BaseSetup.new(script, conf)
+     end
+   end
+ end
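
Dispatch hinges entirely on the `use` declaration at the top of a script: dsl_name pulls out the camel-cased name, require_dsl_lib loads the matching library via snake_case, and eval resolves the class. A sketch (the script path is hypothetical):

    # given a script whose first matching line is: use 'WordCount'
    MapperFactory.dsl_name('examples/word_count_test.rb')  # => "WordCount"
    # require_dsl_lib("WordCount") requires 'word_count';
    # create then instantiates HadoopDsl::WordCount::WordCountMapper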
data/lib/util.rb ADDED
@@ -0,0 +1,11 @@
+ # utility functions
+
+ module HadoopDsl
+   def snake_case(str)
+     str.gsub(/\B[A-Z]/, '_\&').downcase
+   end
+
+   def read_file(file_name)
+     File.read(file_name)
+   end
+ end
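
snake_case is what maps the camel-cased name from a `use` line onto its library file name:

    snake_case('WordCount')    # => "word_count"
    snake_case('LogAnalysis')  # => "log_analysis"
    snake_case('HiveLike')     # => "hive_like"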
data/lib/word_count.rb ADDED
@@ -0,0 +1,76 @@
+ require 'core'
+ require 'enumerator'
+
+ module HadoopDsl::WordCount
+   include HadoopDsl
+
+   AVAILABLE_METHODS = [:count_uniq, :total]
+   TOTAL_PREFIX = "\t"
+
+   # common
+   module WordCountMapRed
+     # entry point
+     def data(description = '', &block) yield end
+   end
+
+   # controller
+   class WordCountMapper < BaseMapper
+     def initialize(script, key, value)
+       super(script, WordCountMapperModel.new(key, value))
+     end
+
+     include WordCountMapRed
+
+     # model methods
+     def_delegators :@model, *AVAILABLE_METHODS
+   end
+
+   class WordCountReducer < BaseReducer
+     def initialize(script, key, values)
+       super(script, WordCountReducerModel.new(key, values))
+     end
+
+     include WordCountMapRed
+
+     # model methods
+     def_delegators :@model, *AVAILABLE_METHODS
+   end
+
+   # model
+   class WordCountMapperModel < BaseMapperModel
+     def initialize(key, value)
+       super(key, value)
+     end
+
+     # emitters
+     def count_uniq
+       @value.split.each {|word| @controller.emit(word => 1)}
+     end
+
+     def total(*types)
+       types.each do |type|
+         case type
+         when :bytes
+           @controller.emit("#{TOTAL_PREFIX}total bytes" => @value.gsub(/\s/, '').length)
+         when :words
+           @controller.emit("#{TOTAL_PREFIX}total words" => @value.split.size)
+         when :lines
+           @controller.emit("#{TOTAL_PREFIX}total lines" => 1)
+         end
+       end
+     end
+   end
+
+   class WordCountReducerModel < BaseReducerModel
+     def initialize(key, values)
+       super(key, values)
+     end
+
+     # emitters
+     def count_uniq; aggregate unless total_value? end
+     def total(*types); aggregate if total_value? end
+
+     private
+
+     def total_value?; @key =~ /^#{TOTAL_PREFIX}/ end
+   end
+ end
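
The bundled word_count_test.rb example is not included in this diff, but the example spec later on expects nine emits for the six-word test line (one per word plus three totals), which implies a script along these lines (paths hypothetical):

    use 'WordCount'
    from 'wc/inputs'
    to 'wc/outputs'

    data 'word count test' do
      count_uniq
      total :bytes, :words, :lines
    end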
data/spec/core_spec.rb ADDED
@@ -0,0 +1,73 @@
+ require 'init'
+ require 'core'
+
+ include HadoopDsl
+
+ describe 'BaseMapRed' do
+   before(:all) do
+     @script = create_tmp_script(<<-EOF)
+       from 'test/inputs'
+       to 'test/outputs'
+     EOF
+   end
+
+   it 'emits a key-value pair' do
+     mapper = BaseMapper.new(@script, BaseMapperModel.new(nil, nil))
+     mapper.emit('key' => 'value')
+     mapper.emitted.should == [{'key' => 'value'}]
+   end
+
+   it 'can run a minimal BaseMapper' do
+     model = BaseMapperModel.new('key', 'value')
+     mapper = BaseMapper.new(@script, model)
+     mapper.run
+   end
+
+   it 'can run a minimal BaseReducer' do
+     model = BaseReducerModel.new('key', 'values')
+     reducer = BaseReducer.new(@script, model)
+     reducer.run
+   end
+
+   it 'can run a minimal BaseSetup' do
+     setup = BaseSetup.new(@script, nil)
+     setup.run
+   end
+
+   describe BaseMapper do
+     it 'can emit as identity' do
+       model = BaseMapperModel.new('key', 'value')
+       mapper = BaseMapper.new(@script, model)
+       model.identity
+
+       mapper.emitted.should == [{'key' => 'value'}]
+     end
+   end
+
+   describe BaseReducer do
+     it 'can emit as aggregate' do
+       model = BaseReducerModel.new('key', [1, 2, 3])
+       reducer = BaseReducer.new(@script, model)
+       model.aggregate
+
+       reducer.emitted.should == [{'key' => 6}]
+     end
+
+     it 'can emit as identity' do
+       model = BaseReducerModel.new('key', [1, 2, 3])
+       reducer = BaseReducer.new(@script, model)
+       model.identity
+
+       reducer.emitted.should == [{'key' => 1}, {'key' => 2}, {'key' => 3}]
+     end
+   end
+
+   describe BaseSetup do
+     it 'can get paths' do
+       setup = BaseSetup.new(@script, nil)
+       setup.run
+       setup.paths[0].should == 'test/inputs'
+       setup.paths[1].should == 'test/outputs'
+     end
+   end
+ end
@@ -0,0 +1,82 @@
+ require 'log_analysis'
+ require 'word_count'
+
+ include HadoopDsl::LogAnalysis
+ describe 'Apache Log Example' do
+   before(:all) do
+     @script = File.join(File.dirname(__FILE__), '..', 'examples', 'apachelog-v2.rb')
+     @value = '127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326'
+   end
+
+   it 'can run the example mapper' do
+     mapper = LogAnalysisMapper.new(@script, nil, @value)
+     mapper.run
+     mapper.emitted.first["user\tfrank"].should == 1
+   end
+
+   it 'can run the example reducer' do
+     reducer = LogAnalysisReducer.new(@script, "user\tfrank", [1, 1, 1])
+     reducer.run
+     reducer.emitted.first["user\tfrank"].should == 3
+   end
+ end
+
+ include HadoopDsl::WordCount
+ describe 'Word Count Example' do
+   before(:all) do
+     @script = File.join(File.dirname(__FILE__), '..', 'examples', 'word_count_test.rb')
+     @value = 'Lorem ipsum ipsum Lorem sit amet,'
+   end
+
+   it 'can run the example mapper' do
+     mapper = WordCountMapper.new(@script, nil, @value)
+     mapper.run
+     mapper.emitted.size.should == 9
+     mapper.emitted.each do |e|
+       case e.keys.first
+       when 'Lorem'
+         e.values.first.should == 1
+       when 'total words'
+         e.values.first.should == 6
+       end
+     end
+   end
+
+   it 'can run the example reducer' do
+     reducer = WordCountReducer.new(@script, "Lorem", [1, 1, 1])
+     reducer.run
+     reducer.emitted.first["Lorem"].should == 3
+   end
+ end
+
+ include HadoopDsl::HiveLike
+ describe 'Hive Like Example' do
+   before(:all) do
+     @script = File.join(File.dirname(__FILE__), '..', 'examples', 'hive_like_test.rb')
+     @value = 'apple, 3, 100'
+   end
+
+   it 'can run setup' do
+     conf = mock('conf')
+     conf.should_receive(:output_key_class=).once
+     conf.should_receive(:output_value_class=).once
+
+     setup = HiveLikeSetup.new(@script, conf)
+     setup.run
+     setup.paths[0].should == 'hive-like/items.txt'
+   end
+
+   it 'can run the example mapper' do
+     mapper = HiveLikeMapper.new(@script, nil, @value)
+     mapper.run
+     mapper.emitted.size.should == 1
+     mapper.emitted.first['items'].should == '3, 100, apple'
+   end
+
+   it 'can run the example reducer' do
+     values = ['v1', 'v2', 'v3']
+     reducer = HiveLikeReducer.new(@script, "items", values)
+     reducer.run
+     reducer.emitted.first["items"].should == 'v1'
+   end
+ end
data/spec/hive_like_spec.rb ADDED
@@ -0,0 +1,58 @@
+ require 'init'
+ require 'core'
+ require 'hive_like'
+
+ include HadoopDsl::HiveLike
+
+ describe HiveLikeSetup do
+   it 'should load data' do
+     script = create_tmp_script(%Q!load_data "hive-like/inputs", items;!)
+     conf = mock('conf')
+     conf.should_receive(:output_key_class=).once
+     conf.should_receive(:output_value_class=).once
+
+     setup = HiveLikeSetup.new(script, conf)
+     setup.run
+     setup.paths[0].should == 'hive-like/inputs'
+     setup.paths[1].should == 'hive-like/outputs'
+   end
+ end
+
+ describe HiveLikeMapper do
+   before do
+     @value = 'apple, 3, 100'
+   end
+
+   it 'should create a table' do
+     mapper = HiveLikeMapper.new(nil, nil, @value)
+     mapper.create_table('items', 'item', 'STRING', 'quantity', 'INT', 'price', 'INT')
+     mapper.table.name.should == 'items'
+     mapper.table.column(0).should == 'item'
+     mapper.table.column(1).should == 'quantity'
+   end
+
+   it 'should select' do
+     mapper = HiveLikeMapper.new(nil, nil, @value)
+     mapper.create_table('items', 'item', 'STRING', 'quantity', 'INT', 'price', 'INT')
+     mapper.select("item", "quantity", "price", "from", "items")
+     mapper.emitted.first.should == {'items' => 'apple, 3, 100'}
+   end
+
+   it 'should pre-process the script body' do
+     body = "select foo, bar from table;\n"
+     mapper = HiveLikeMapper.new(nil, nil, @value)
+     processed = mapper.pre_process(body)
+     processed.should == %Q!select("foo", "bar", "from", "table")\n!
+   end
+ end
+
+ describe HiveLikeReducer do
+   it 'should select as identity' do
+     key = 'Lorem'
+     values = [1, 1, 1]
+     reducer = HiveLikeReducer.new(nil, key, values)
+
+     reducer.select
+     reducer.emitted[0].should == {'Lorem' => 1}
+   end
+ end