hadoop-rubydsl 0.0.3 → 0.0.4
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/bin/hrd +1 -1
- data/examples/hive_like_test.rb +1 -1
- data/examples/{apachelog-v2-2.rb → log_analysis_test.rb} +24 -1
- data/examples/word_count_test.rb +1 -1
- data/hadoop-rubydsl.gemspec +8 -14
- data/lib/core.rb +46 -49
- data/lib/dsl_init.rb +1 -10
- data/lib/hadoop_dsl.rb +14 -0
- data/lib/{client.rb → hadoop_dsl_client.rb} +10 -0
- data/lib/hive_like.rb +24 -40
- data/lib/log_analysis.rb +109 -69
- data/lib/mapred_factory.rb +5 -5
- data/lib/util.rb +3 -3
- data/lib/word_count.rb +22 -42
- data/spec/client_spec.rb +2 -1
- data/spec/core_spec.rb +4 -4
- data/spec/dsl_init_spec.rb +1 -1
- data/spec/example_spec.rb +7 -5
- data/spec/hive_like_spec.rb +1 -1
- data/spec/log_analysis_spec.rb +60 -3
- data/spec/mapred_factory_spec.rb +30 -10
- data/spec/spec_helper.rb +0 -1
- data/spec/util_spec.rb +3 -4
- data/spec/word_count_spec.rb +1 -1
- metadata +6 -12
- data/TODO +0 -2
- data/examples/apachelog-v2.rb +0 -25
- data/examples/apachelog.rb +0 -15
- data/lib/hadoop-dsl.rb +0 -12
data/VERSION
CHANGED
@@ -1 +1 @@
-0.0.3
+0.0.4
data/bin/hrd
CHANGED
data/examples/hive_like_test.rb
CHANGED
data/examples/{apachelog-v2-2.rb → log_analysis_test.rb}
CHANGED
@@ -1,4 +1,4 @@
-use 'LogAnalysis'
+dsl 'LogAnalysis'
 
 data 'apache log on test2' do
   from 'apachelog/inputs'
@@ -14,5 +14,28 @@ data 'apache log on test2' do
     topic 'ua counts', :label => 'ua' do
       count_uniq column[:ua]
     end
+
+    topic 'count bot', :label => 'bot' do
+      ua = column[:ua].value
+      bot = ua if ua =~ /bot/i
+      count_uniq bot
+    end
+
+    topic 'ua counts group by path' do
+      request = column[:request].value
+      path = request.split(/\s+/)[1]
+      group_by path
+      count_uniq column[:ua]
+    end
+
+    topic 'ua counts by daily' do
+      group_date_by column[:access_date], :daily
+      count_uniq column[:ua]
+    end
+
+    # topic 'total bytes' do
+    #   select_date column[:access_date], BY_MONTHLY
+    #   sum column[:bytes].to_kilobytes # / 1024
+    # end
   end
 end
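The renamed example exercises the grouping features new in 0.0.4 (group_by, group_date_by, and count_uniq on a computed value). A minimal standalone sketch of how a topic's emitted key is assembled — label first, then any group keys, then the uniq'ed value, all tab-joined; the gem does this inside LogAnalysis's Topic#key, and the sample values here are illustrative:

KEY_SEP = "\t"
label        = 'ua'                       # from :label => 'ua'
key_elements = ['20001010', 'Googlebot']  # pushed by group_date_by / count_uniq
key = ([label] + key_elements).join(KEY_SEP)
p key  # => "ua\t20001010\tGooglebot" -- emitted as {key => 1}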
data/examples/word_count_test.rb
CHANGED
data/hadoop-rubydsl.gemspec
CHANGED
@@ -5,38 +5,34 @@
 
 Gem::Specification.new do |s|
   s.name = %q{hadoop-rubydsl}
-  s.version = "0.0.3"
+  s.version = "0.0.4"
 
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Koichi Fujikawa"]
-  s.date = %q{2010-01-…}
+  s.date = %q{2010-01-13}
   s.description = %q{Hadoop Ruby DSL}
   s.email = %q{fujibee@gmail.com}
   s.executables = ["hrd", "hadoop-hudson.sh", "hadoop-ruby.sh"]
   s.extra_rdoc_files = [
-    "README.rdoc",
-    "TODO"
+    "README.rdoc"
   ]
   s.files = [
     ".gitignore",
     "README.rdoc",
     "Rakefile",
-    "TODO",
     "VERSION",
     "bin/hadoop-hudson.sh",
     "bin/hadoop-ruby.sh",
     "bin/hrd",
     "conf/hadoop-site.xml",
-    "examples/apachelog-v2-2.rb",
-    "examples/apachelog-v2.rb",
-    "examples/apachelog.rb",
     "examples/hive_like_test.rb",
+    "examples/log_analysis_test.rb",
     "examples/word_count_test.rb",
     "hadoop-rubydsl.gemspec",
-    "lib/client.rb",
     "lib/core.rb",
     "lib/dsl_init.rb",
-    "lib/hadoop-dsl.rb",
+    "lib/hadoop_dsl.rb",
+    "lib/hadoop_dsl_client.rb",
     "lib/hive_like.rb",
     "lib/log_analysis.rb",
     "lib/mapred_factory.rb",
@@ -59,11 +55,9 @@ Gem::Specification.new do |s|
     "spec/hive_like_spec.rb",
     "spec/log_analysis_spec.rb",
     "spec/example_spec.rb",
-    "examples/apachelog-v2.rb",
     "examples/hive_like_test.rb",
-    "examples/word_count_test.rb",
-    "examples/apachelog-v2-2.rb",
-    "examples/apachelog.rb"
+    "examples/log_analysis_test.rb",
+    "examples/word_count_test.rb"
   ]
 
   if s.respond_to? :specification_version then
data/lib/core.rb
CHANGED
@@ -1,10 +1,33 @@
-require '…'
+require 'hadoop_dsl'
 require 'forwardable'
 
 module HadoopDsl
+  # common
+  module DslElement
+    # all DSL statements without def is processed here
+    def method_missing(method_name, *args)
+      yield if block_given?
+      self
+    end
+  end
+
   # controller
+  module DslController
+    include DslElement
+
+    def run
+      body = pre_process(HadoopDsl.read_file(@script))
+      eval(body, binding, @script)
+    end
+
+    def pre_process(body)
+      body # do nothing
+    end
+  end
+
   class BaseMapRed
     extend Forwardable
+    include DslController
 
     attr_reader :emitted
@@ -14,66 +37,54 @@ module HadoopDsl
       @emitted = []
     end
 
-    def run
-      body = pre_process(read_file(@script))
-      eval(body, binding, @script)
-    end
-
-    def pre_process(body)
-      body # do nothing
-    end
-
     def emit(hash) @emitted << hash end
 
-
-    def …
+    private
+    def key; @model.key end
   end
 
   class BaseSetup
+    include DslController
+
     def initialize(script, conf)
       @script, @conf = script, conf
       output_format
     end
 
-    def run
-      body = pre_process(read_file(@script))
-      eval(body, binding, @script)
-    end
-
-    def pre_process(body)
-      body # do nothing
-    end
-
-    # do nothing
-    def output_format; end
-
+    def output_format; end # do nothing
     def paths; [@from, @to] end
-
     def from(path) @from = path end
     def to(path) @to = path end
-
-    # all DSL statements without def is processed here
-    def method_missing(method_name, *args) self end
   end
 
   class BaseMapper < BaseMapRed
-
-
+    # common functions
+    def identity
+      emit(@model.key => @model.value)
     end
+
+    private
+    def value; @model.values end
   end
 
   class BaseReducer < BaseMapRed
-
-
+    # common functions
+    def aggregate
+      emit(@model.key => @model.values.inject {|ret, i| ret + i})
+    end
+
+    def identity
+      @model.values.each {|v| emit(@model.key => v)}
     end
+
+    private
+    def values; @model.values end
   end
 
   # model
   class BaseModel
+    include DslElement
     attr_accessor :controller
-
-    # all DSL statements without def is processed here
-    def method_missing(method_name, *args) self end
   end
 
   class BaseMapperModel < BaseModel
@@ -82,11 +93,6 @@ module HadoopDsl
 
     def initialize(key, value)
       @key, @value = key, value
     end
-
-    # common functions
-    def identity
-      @controller.emit(@key => @value)
-    end
   end
 
   class BaseReducerModel < BaseModel
@@ -95,14 +101,5 @@ module HadoopDsl
 
     def initialize(key, values)
      @key, @values = key, values
     end
-
-    # common functions
-    def aggregate
-      @controller.emit(@key => @values.inject {|ret, i| ret + i})
-    end
-
-    def identity
-      @values.each {|v| @controller.emit(@key => v)}
-    end
   end
 end
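The extracted DslController/DslElement pair above is the heart of the 0.0.4 refactoring: a script body is read, eval'd against the controller's binding, and any DSL statement the controller doesn't define falls through method_missing (running its block, if given) instead of raising. A condensed standalone sketch of that pattern — TinyController and the sample script are illustrative, not the gem's code:

class TinyController
  def initialize(script_body) @body = script_body end
  def run; eval(@body, binding) end

  attr_reader :emitted
  def emit(hash) (@emitted ||= []) << hash end

  # all DSL statements without def are swallowed here, as in DslElement
  def method_missing(method_name, *args)
    yield if block_given?
    self
  end
end

c = TinyController.new(<<~RUBY)
  data 'wrapper statement, no method defined' do
    emit('key' => 1)  # only methods the controller defines take effect
  end
RUBY
c.run
p c.emitted  # => [{"key"=>1}]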
data/lib/dsl_init.rb
CHANGED
@@ -1,16 +1,7 @@
-require '…'
-require 'java'
-require 'mapred_factory'
-
-import 'org.apache.hadoop.io.IntWritable'
-import 'org.apache.hadoop.io.Text'
+require 'hadoop_dsl'
 
 include HadoopDsl
 
-# Hadoop IO types
-HadoopDsl::Text = Text
-HadoopDsl::IntWritable = IntWritable
-
 def map(key, value, output, reporter, script)
   mapper = MapperFactory.create(script, key, value)
   mapper.run
data/lib/hadoop_dsl.rb
ADDED
@@ -0,0 +1,14 @@
+require 'util'
+require 'mapred_factory'
+require 'core'
+
+# for jruby
+if defined? JRUBY_VERSION
+  require 'java'
+  import 'org.apache.hadoop.io.IntWritable'
+  import 'org.apache.hadoop.io.Text'
+
+  # Hadoop IO types
+  HadoopDsl::Text = Text
+  HadoopDsl::IntWritable = IntWritable
+end
data/lib/{client.rb → hadoop_dsl_client.rb}
CHANGED
@@ -1,4 +1,14 @@
+require 'jruby-on-hadoop'
+
 module HadoopDsl
+  def self.lib_path
+    File.expand_path(File.dirname(__FILE__))
+  end
+
+  def self.dsl_init_script
+    File.join(lib_path, "dsl_init.rb")
+  end
+
   class Client < JRubyOnHadoop::Client
     def parse_args
       super
data/lib/hive_like.rb
CHANGED
@@ -1,11 +1,6 @@
-require '…'
-require 'enumerator'
+require 'hadoop_dsl'
 
 module HadoopDsl::HiveLike
-  include HadoopDsl
-
-  AVAILABLE_METHODS = [:select, :create_table, :table]
-
   # common
   module HiveLikeMapRed
     def pre_process(body)
@@ -17,7 +12,7 @@ module HadoopDsl::HiveLike
         args = sprit_and_marge_args($2)
         processed << "#{method}(#{args})\n"
       else
-        processed << line + "\n"
+        processed << line + "\n" if line
       end
     end
     processed
@@ -32,15 +27,15 @@ module HadoopDsl::HiveLike
   end
 
   # controller
-  class HiveLikeSetup < BaseSetup
+  class HiveLikeSetup < HadoopDsl::BaseSetup
     def load_data(inputs, table)
       @from = inputs
       @to = inputs.gsub(/#{File.basename(inputs)}$/, 'outputs')
     end
 
     def output_format
-      @conf.output_key_class = Text
-      @conf.output_value_class = Text
+      @conf.output_key_class = HadoopDsl::Text
+      @conf.output_value_class = HadoopDsl::Text
     end
 
     # might not need but occur error if not exists
@@ -49,37 +44,43 @@ module HadoopDsl::HiveLike
     include HiveLikeMapRed
   end
 
-  class HiveLikeMapper < BaseMapper
+  class HiveLikeMapper < HadoopDsl::BaseMapper
     def initialize(script, key, value)
       super(script, HiveLikeMapperModel.new(key, value))
     end
 
     include HiveLikeMapRed
 
-
-
+    def_delegators :@model, :create_table, :table
+
+    # emitters
+    def select(*args)
+      from_index = args.index('from')
+      if from_index
+        values = args[0...from_index].map do |column|
+          splitted = @model.value.split(/[,\s]+/)
+          splitted[@model.table.columns.index(column)]
+        end
+        emit(args[from_index + 1] => values.join(", "))
+      end
+    end
   end
 
-  class HiveLikeReducer < BaseReducer
+  class HiveLikeReducer < HadoopDsl::BaseReducer
     def initialize(script, key, values)
       super(script, HiveLikeReducerModel.new(key, values))
     end
 
     include HiveLikeMapRed
 
-    # …
-
+    # emitters
+    def select(*args) identity end
   end
 
   # model
-  class HiveLikeMapperModel < BaseMapperModel
+  class HiveLikeMapperModel < HadoopDsl::BaseMapperModel
     attr_reader :table
 
-    def initialize(key, value)
-      super(key, value)
-    end
-
-    # emitters
     def create_table(name, *column_and_type)
       @table = Table.new(name)
       column_and_type.each_with_index do |column, index|
@@ -88,17 +89,6 @@ module HadoopDsl::HiveLike
       end
     end
 
-    def select(*args)
-      from_index = args.index('from')
-      if from_index
-        values = args[0...from_index].map do |column|
-          splitted = @value.split(/[,\s]+/)
-          splitted[@table.columns.index(column)]
-        end
-        @controller.emit(args[from_index + 1] => values.join(", "))
-      end
-    end
-
   class Table
     attr_reader :name, :columns
@@ -111,12 +101,6 @@ module HadoopDsl::HiveLike
     end
   end
 
-  class HiveLikeReducerModel < BaseReducerModel
-    def initialize(key, values)
-      super(key, values)
-    end
-
-    # emitters
-    def select(*args) identity end
+  class HiveLikeReducerModel < HadoopDsl::BaseReducerModel
   end
 end
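HiveLikeMapRed#pre_process (partially shown above) rewrites SQL-ish statements such as select title, point from table; into ordinary Ruby method calls before the body is eval'd, which is why select receives the literal string 'from' among its arguments. A standalone sketch of that idea — the regex and quoting below are illustrative, not the gem's exact code:

def hive_like_pre_process(body)
  body.each_line.map do |line|
    if line =~ /^\s*(select|create_table|load_data)\s+(.*);/
      method, args = $1, $2.split(/[,\s]+/).map {|a| "'#{a}'" }.join(', ')
      "#{method}(#{args})"
    else
      line.chomp
    end
  end.join("\n")
end

puts hive_like_pre_process("select title, point from table;\n")
# => select('title', 'point', 'from', 'table')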
data/lib/log_analysis.rb
CHANGED
@@ -1,55 +1,123 @@
-require '…'
+require 'hadoop_dsl'
 require 'enumerator'
 
 module HadoopDsl::LogAnalysis
-  include HadoopDsl
-
   KEY_SEP = "\t"
   PREFIX = 'col'
   PASS = nil
-
+  MODEL_METHODS = [:column, :value]
 
-  # …
-
-
-
+  # controller
+  class LogAnalysisMapper < HadoopDsl::BaseMapper
+    def initialize(script, key, value)
+      super(script, LogAnalysisMapperModel.new(key, value))
+    end
 
-
-
+    # model methods
+    def_delegators :@model, *MODEL_METHODS
+
+    def topic(desc, options = {}, &block)
+      @model.create_topic(desc, options)
+      yield if block_given?
+      current_topic
+    end
 
-
-
-
-      super(script, conf)
+    def separate(sep)
+      parts = value.split(sep)
+      @model.create_or_replace_columns_with(parts) {|column, value| column.value = value}
     end
 
-
-
+    def pattern(re)
+      if value =~ re
+        md = Regexp.last_match
+        @model.create_or_replace_columns_with(md.captures) {|column, value| column.value = value}
+      end
+    end
 
-
-    def …
-
+    # column names by String converted to Symbol
+    def column_name(*names)
+      sym_names = names.map {|name| name.is_a?(String) ? name.to_sym : name }
+      @model.create_or_replace_columns_with(sym_names) {|column, name| column.name = name}
     end
 
-
+    def group_by(column_or_value)
+      case column_or_value
+      when LogAnalysisMapperModel::Column
+        column = column_or_value
+        current_topic.key_elements << column.value
+      else
+        value = column_or_value
+        current_topic.key_elements << value
+      end
+    end
 
-
-
+    def group_date_by(column, term)
+      require 'time'
+      time = parse_time(column.value)
+      time_key = case term
+                 when :daily then time.strftime('%Y%m%d')
+                 when :monthly then time.strftime('%Y%m')
+                 when :yearly then time.strftime('%Y')
+                 end
+      current_topic.key_elements << time_key
+    end
+
+    # emitters
+    def count_uniq(column_or_value)
+      uniq_key =
+        case column_or_value
+        when LogAnalysisMapperModel::Column
+          column = column_or_value
+          column.value
+        else column_or_value # value
+        end
+      current_topic.key_elements << uniq_key
+      emit(current_topic.key => 1)
+    end
+
+    def sum(column)
+      emit(current_topic.key => column.value.to_i)
+    end
+
+    private
+    def current_topic; @model.current_topic end
+
+    def parse_time(str)
+      begin Time.parse(str)
+      rescue
+        # apachelog pattern ex) "10/Oct/2000:13:55:36 -0700"
+        Time.parse($1) if str =~ /^(\d*\/\w*\/\d*):/
+      end
+    end
   end
 
-  class LogAnalysisReducer < BaseReducer
+  class LogAnalysisReducer < HadoopDsl::BaseReducer
     def initialize(script, key, values)
       super(script, LogAnalysisReducerModel.new(key, values))
     end
 
-    include LogAnalysisMapRed
-
     # model methods
-    def_delegators :@model, *…
+    def_delegators :@model, *MODEL_METHODS
+
+    def topic(desc, options = {}, &block)
+      @model.create_topic(desc, options)
+      yield if block_given?
+      @model.current_topic
+    end
+
+    def count_uniq(column)
+      aggregate if @model.topic == @model.current_topic
+    end
+
+    def sum(column)
+      aggregate if @model.topic == @model.current_topic
+    end
   end
 
   # model
-  class LogAnalysisMapperModel < BaseMapperModel
+  class LogAnalysisMapperModel < HadoopDsl::BaseMapperModel
+    attr_reader :current_topic
+
     def initialize(key, value)
       super(key, value)
       @columns = ColumnArray.new
@@ -58,28 +126,8 @@ module HadoopDsl::LogAnalysis
 
     def column; @columns end
 
-    def topic(desc, options = {}, &block)
+    def create_topic(desc, options)
       @topics << @current_topic = Topic.new(desc, options[:label])
-      yield if block_given?
-      @current_topic
-    end
-
-    def separate(sep)
-      parts = @value.split(sep)
-      create_or_replace_columns_with(parts) {|column, value| column.value = value}
-    end
-
-    def pattern(re)
-      if @value =~ re
-        md = Regexp.last_match
-        create_or_replace_columns_with(md.captures) {|column, value| column.value = value}
-      end
-    end
-
-    # column names by String converted to Symbol
-    def column_name(*names)
-      sym_names = names.map {|name| name.is_a?(String) ? name.to_sym : name }
-      create_or_replace_columns_with(sym_names) {|column, name| column.name = name}
     end
 
     def create_or_replace_columns_with(array, &block)
@@ -91,15 +139,6 @@ module HadoopDsl::LogAnalysis
       @columns = ColumnArray.new(columns)
     end
 
-    # emitters
-    def count_uniq(column)
-      @controller.emit([@current_topic.label, KEY_SEP, column.value].join => 1)
-    end
-
-    def sum(column)
-      @controller.emit([@current_topic.label].join => column.value.to_i)
-    end
-
     class ColumnArray < Array
       def [](key)
         case key
@@ -120,17 +159,28 @@ module HadoopDsl::LogAnalysis
     end
 
     class Topic
+      attr_reader :key_elements
+
       def initialize(desc, label = nil)
        @desc, @label = desc, label
+        @key_elements = []
      end
 
      def label
        @label || @desc.gsub(/\s/, '_')
      end
+
+      def key
+        without_label =
+          @key_elements.size > 0 ? @key_elements.join(KEY_SEP) : nil
+        [label, without_label].compact.join(KEY_SEP)
+      end
    end
  end
 
-  class LogAnalysisReducerModel < BaseReducerModel
+  class LogAnalysisReducerModel < HadoopDsl::BaseReducerModel
+    attr_reader :topic, :current_topic
+
    def initialize(key, values)
      super(key, values)
      if key =~ /(\w*)#{KEY_SEP}?(.*)/
@@ -138,18 +188,8 @@ module HadoopDsl::LogAnalysis
      end
    end
 
-    def …
+    def create_topic(desc, options)
      @current_topic = Topic.new(options[:label] || desc.gsub(/\s/, '_'), nil)
-      yield if block_given?
-      @current_topic
-    end
-
-    def count_uniq(column)
-      aggregate if @topic == @current_topic
-    end
-
-    def sum(column)
-      aggregate if @topic == @current_topic
    end
 
    class Topic
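The rescue branch in parse_time exists because Apache's access-log timestamp ("10/Oct/2000:13:55:36 -0700", per the comment in the code) is not handled directly by Time.parse, so the date portion is extracted first. The fallback, re-run standalone:

require 'time'

str  = '10/Oct/2000:13:55:36 -0700'
date = str[/^(\d*\/\w*\/\d*):/, 1]        # => "10/Oct/2000"
puts Time.parse(date).strftime('%Y%m%d')  # => 20001010, a :daily group key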
data/lib/mapred_factory.rb
CHANGED
@@ -1,16 +1,16 @@
-require '…'
+require 'hadoop_dsl'
 
 module HadoopDsl
   class MapRedFactory
     def self.dsl_name(script)
-      read_file(script).each_line do |line|
-        dsl_name = $1 if line =~ …
-        return dsl_name
+      HadoopDsl.read_file(script).each_line do |line|
+        dsl_name = $1 if line =~ /\s*dsl\s*\(?["'](\w*)["']\)?/
+        return dsl_name if dsl_name
       end
     end
 
     def self.require_dsl_lib(dsl_name)
-      require snake_case(dsl_name)
+      require HadoopDsl.snake_case(dsl_name)
     end
   end
 
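The loosened dsl_name regex is what the new mapred_factory_spec cases below exercise; a quick standalone check against the accepted spellings of the dsl declaration:

RE = /\s*dsl\s*\(?["'](\w*)["']\)?/

[%q{dsl 'LogAnalysis'},
 %q{dsl "LogAnalysis"},
 %q{  dsl  "LogAnalysis"  },
 %q{ dsl ("LogAnalysis") }].each do |line|
  puts(line =~ RE ? $1 : 'no match')  # prints LogAnalysis four times
end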
data/lib/util.rb
CHANGED
@@ -1,18 +1,18 @@
 # utility functions
+require 'hadoop_dsl'
 
 module HadoopDsl
-  def snake_case(str)
+  def self.snake_case(str)
     str.gsub(/\B[A-Z]/, '_\&').downcase
   end
 
-  def read_file(file_name)
+  def self.read_file(file_name)
     # read as usual
     body = File.open(file_name).read rescue nil
     return body if body
 
     # read from loadpath
     $:.each do |path|
-      p path
       body = File.open(File.join(path, file_name)).read rescue next
       return body if body
     end
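snake_case (now a module function, matching the HadoopDsl.snake_case call sites above) is what maps a dsl declaration to its library file: \B[A-Z] matches capitals that don't begin a word, so only interior humps gain an underscore. Standalone:

def snake_case(str)
  str.gsub(/\B[A-Z]/, '_\&').downcase
end

puts snake_case('CamelCaseStr')  # => camel_case_str
puts snake_case('LogAnalysis')   # => log_analysis -- the file require_dsl_lib loads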
data/lib/word_count.rb
CHANGED
@@ -1,76 +1,56 @@
-require '…'
+require 'hadoop_dsl'
 require 'enumerator'
 
 module HadoopDsl::WordCount
-
-
-  AVAILABLE_METHODS = [:count_uniq, :total]
+  MODEL_METHODS = []
   TOTAL_PREFIX = "\t"
 
-  # common
-  module WordCountMapRed
-    # entry point
-    def data(description = '', &block) yield end
-  end
-
   # controller
-  class WordCountMapper < BaseMapper
+  class WordCountMapper < HadoopDsl::BaseMapper
     def initialize(script, key, value)
       super(script, WordCountMapperModel.new(key, value))
     end
 
-    include WordCountMapRed
-
-    # model methods
-    def_delegators :@model, *AVAILABLE_METHODS
-  end
-
-  class WordCountReducer < BaseReducer
-    def initialize(script, key, values)
-      super(script, WordCountReducerModel.new(key, values))
-    end
-
-    include WordCountMapRed
-
     # model methods
-    def_delegators :@model, *…
-  end
-
-  # model
-  class WordCountMapperModel < BaseMapperModel
-    def initialize(key, value)
-      super(key, value)
-    end
+    def_delegators :@model, *MODEL_METHODS
 
     # emitters
     def count_uniq
-      @value.split.each {|word| …}
+      @model.value.split.each {|word| emit(word => 1)}
     end
 
     def total(*types)
       types.each do |type|
         case type
         when :bytes
-          …
+          emit("#{TOTAL_PREFIX}total bytes" => @model.value.gsub(/\s/, '').length)
         when :words
-          …
+          emit("#{TOTAL_PREFIX}total words" => @model.value.split.size)
         when :lines
-          …
+          emit("#{TOTAL_PREFIX}total lines" => 1)
         end
       end
     end
   end
 
-  class …
-    def initialize(key, values)
-      super(key, values)
+  class WordCountReducer < HadoopDsl::BaseReducer
+    def initialize(script, key, values)
+      super(script, WordCountReducerModel.new(key, values))
     end
 
+    # model methods
+    def_delegators :@model, *MODEL_METHODS
+
     # emitters
-    def count_uniq; aggregate unless total_value? end
-    def total(*types); aggregate if total_value? end
+    def count_uniq; aggregate unless @model.total_value? end
+    def total(*types); aggregate if @model.total_value? end
+  end
+
+  # model
+  class WordCountMapperModel < HadoopDsl::BaseMapperModel
+  end
 
-
+  class WordCountReducerModel < HadoopDsl::BaseReducerModel
     def total_value?; @key =~ /^#{TOTAL_PREFIX}/ end
   end
 end
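For reference, a complete WordCount script in the 0.0.4 dialect — a sketch: the dsl, from/to, count_uniq, and total statements come from the code above, while the paths are illustrative:

dsl 'WordCount'

from 'wc/inputs'
to 'wc/outputs'

count_uniq
total :bytes, :words, :lines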
data/spec/client_spec.rb
CHANGED
data/spec/core_spec.rb
CHANGED
@@ -1,4 +1,4 @@
-require '…'
+require File.join(File.dirname(__FILE__), 'spec_helper')
 require 'core'
 
 include HadoopDsl
@@ -38,7 +38,7 @@ to 'test/outputs'
   it 'can emit as identity' do
     model = BaseMapperModel.new('key', 'value')
     mapper = BaseMapper.new(@script, model)
-    model.identity
+    mapper.identity
 
     mapper.emitted.should == [{'key' => 'value'}]
   end
@@ -48,7 +48,7 @@ to 'test/outputs'
   it 'can emit as aggregate' do
     model = BaseReducerModel.new('key', [1, 2, 3])
     reducer = BaseReducer.new(@script, model)
-    model.aggregate
+    reducer.aggregate
 
     reducer.emitted.should == [{'key' => 6}]
   end
@@ -56,7 +56,7 @@ to 'test/outputs'
   it 'can emit as identity' do
     model = BaseReducerModel.new('key', [1, 2, 3])
     reducer = BaseReducer.new(@script, model)
-    model.identity
+    reducer.identity
 
     reducer.emitted.should == [{'key' => 1}, {'key' => 2}, {'key' => 3}]
  end
data/spec/dsl_init_spec.rb
CHANGED
data/spec/example_spec.rb
CHANGED
@@ -1,23 +1,25 @@
 require 'log_analysis'
 require 'word_count'
+require 'hive_like'
 
 include HadoopDsl::LogAnalysis
 describe 'Aapach Log Example' do
   before(:all) do
-    @script = File.join(File.dirname(__FILE__), '..', 'examples', 'apachelog-v2-2.rb')
-    @value = …
+    @script = File.join(File.dirname(__FILE__), '..', 'examples', 'log_analysis_test.rb')
+    @bot_ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
+    @value = %Q!127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326 "-" "#{@bot_ua}"!
   end
 
   it 'can run example by mapper' do
     mapper = LogAnalysisMapper.new(@script, nil, @value)
     mapper.run
-    mapper.emitted.first…
+    mapper.emitted.first.should == {"ua\t#{@bot_ua}" => 1}
   end
 
   it 'can run example by reducer' do
-    reducer = LogAnalysisReducer.new(@script, "…", [1, 1, 1])
+    reducer = LogAnalysisReducer.new(@script, "ua\tChrome", [1, 1, 1])
     reducer.run
-    reducer.emitted.first["…"…
+    reducer.emitted.first["ua\tChrome"].should == 3
   end
 end
 
data/spec/hive_like_spec.rb
CHANGED
data/spec/log_analysis_spec.rb
CHANGED
@@ -1,4 +1,4 @@
-require '…'
+require File.join(File.dirname(__FILE__), 'spec_helper')
 require 'log_analysis'
 
 include HadoopDsl::LogAnalysis
@@ -39,13 +39,22 @@ describe LogAnalysisMapper do
     mapper.column[:user].value.should == 'frank'
   end
 
-  it 'should count uniq column' do
+  it 'should count uniq by column' do
     value = 'count uniq'
     mapper = LogAnalysisMapper.new(nil, nil, value)
     mapper.separate(' ')
     mapper.topic('t1') { mapper.count_uniq mapper.column[1] }
 
-    mapper.emitted.…
+    mapper.emitted.should == [{"t1\tuniq" => 1}]
+  end
+
+  it 'should count uniq by value' do
+    value = 'count uniq'
+    mapper = LogAnalysisMapper.new(nil, nil, value)
+    mapper.separate(' ')
+    mapper.topic('t1') { mapper.count_uniq 'orig value' }
+
+    mapper.emitted.should == [{"t1\torig value" => 1}]
   end
 
   it 'should sum column value' do
@@ -83,6 +92,54 @@ describe LogAnalysisMapper do
     topic = mapper.topic('desc with space')
     topic.label.should == 'desc_with_space'
   end
+
+  it 'can group date monthly' do
+    value = '2010/1/1 newyearday'
+    mapper = LogAnalysisMapper.new(nil, nil, value)
+    mapper.separate(' ')
+    mapper.column_name 'date', 'holiday'
+
+    ['yearly', 'monthly', 'daily'].each do |term|
+      mapper.topic(term) do
+        mapper.group_date_by mapper.column[:date], term.to_sym
+        mapper.count_uniq mapper.column[:holiday]
+      end
+    end
+    mapper.emitted.should ==
+      [
+        {"yearly\t2010\tnewyearday" => 1},
+        {"monthly\t201001\tnewyearday" => 1},
+        {"daily\t20100101\tnewyearday" => 1}
+      ]
+  end
+
+  it 'can group by' do
+    value = '1 sub_2 bingo!'
+    mapper = LogAnalysisMapper.new(nil, nil, value)
+    mapper.separate(' ')
+    mapper.column_name 'id', 'sub_id', 'data'
+
+    mapper.topic('test') do
+      mapper.group_by mapper.column[:sub_id]
+      mapper.count_uniq mapper.column[:data]
+    end
+    mapper.emitted.should == [{"test\tsub_2\tbingo!" => 1}]
+  end
+end
+
+Topic = LogAnalysisMapperModel::Topic
+describe Topic do
+  it 'can get key with label' do
+    t = Topic.new('label')
+    t.key.should == 'label'
+  end
+
+  it 'can get key with label and elements' do
+    t = Topic.new('label')
+    t.key_elements << 'e1'
+    t.key_elements << 'e2'
+    t.key.should == "label\te1\te2"
+  end
 end
 
 describe LogAnalysisReducer do
data/spec/mapred_factory_spec.rb
CHANGED
@@ -1,31 +1,33 @@
 require File.join(File.dirname(__FILE__) , 'spec_helper')
-
 require 'mapred_factory'
-require 'log_analysis'
 
-…
+include HadoopDsl
 
+describe 'MapRed Factory' do
   before(:all) do
-    @script = create_tmp_script("use 'LogAnalysis'")
+    @script = create_tmp_script("dsl 'LogAnalysis'")
   end
 
   it 'can create mapper' do
     mapper = MapperFactory.create(@script, nil, nil)
-    mapper.class.should == LogAnalysisMapper
+    mapper.class.should == LogAnalysis::LogAnalysisMapper
   end
 
   it 'can create reducer' do
     reducer = ReducerFactory.create(@script, nil, nil)
-    reducer.class.should == LogAnalysisReducer
+    reducer.class.should == LogAnalysis::LogAnalysisReducer
   end
 
   it 'can create setup' do
-    …
-    …
+    conf = mock('conf')
+    conf.should_receive(:output_key_class=).once
+    conf.should_receive(:output_value_class=).once
+    s = SetupFactory.create(create_tmp_script("dsl 'HiveLike'"), conf)
+    s.class.should == HiveLike::HiveLikeSetup
   end
 
   it 'can create base if not exists in specific DSL' do
-    s = SetupFactory.create(create_tmp_script("use 'WordCount'"), nil)
+    s = SetupFactory.create(create_tmp_script("dsl 'WordCount'"), nil)
     s.class.should == BaseSetup
   end
 
@@ -37,6 +39,24 @@ describe 'MapRed Factory' do
   it 'can convert dsl name to dsl lib file and require' do
     dsl_name = MapRedFactory.dsl_name(@script)
     MapRedFactory.require_dsl_lib(dsl_name).should_not be_nil
-    LogAnalysisMapper
+    LogAnalysis::LogAnalysisMapper
+  end
+
+  it 'can create mapper if statement has double quote' do
+    script = create_tmp_script(%Q!dsl "LogAnalysis"!)
+    mapper = MapperFactory.create(script, nil, nil)
+    mapper.class.should == LogAnalysis::LogAnalysisMapper
+  end
+
+  it 'can create mapper if exists more space' do
+    script = create_tmp_script(%Q! dsl "LogAnalysis" !)
+    mapper = MapperFactory.create(script, nil, nil)
+    mapper.class.should == LogAnalysis::LogAnalysisMapper
+  end
+
+  it 'can create mapper if exists bracket' do
+    script = create_tmp_script(%Q! dsl ("LogAnalysis") !)
+    mapper = MapperFactory.create(script, nil, nil)
+    mapper.class.should == LogAnalysis::LogAnalysisMapper
   end
 end
data/spec/spec_helper.rb
CHANGED
data/spec/util_spec.rb
CHANGED
@@ -1,19 +1,18 @@
 require File.join(File.dirname(__FILE__) , 'spec_helper')
-
 require 'util'
 
 describe 'utilities' do
   it 'can change camelcase str to snakecase' do
-    snake_case('CamelCaseStr').should == 'camel_case_str'
+    HadoopDsl.snake_case('CamelCaseStr').should == 'camel_case_str'
   end
 
   it 'can read file and get file data to string' do
     script_body = 'This is a script body.'
     @script = create_tmp_script(script_body)
-    read_file(@script).should == script_body
+    HadoopDsl.read_file(@script).should == script_body
   end
 
   it 'raise error if no file in loadpath' do
-    lambda { read_file('not_exists_on_loadpath') }.should raise_error
+    lambda { HadoopDsl.read_file('not_exists_on_loadpath') }.should raise_error
   end
 end
data/spec/word_count_spec.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: hadoop-rubydsl
 version: !ruby/object:Gem::Version
-  version: 0.0.3
+  version: 0.0.4
 platform: ruby
 authors:
 - Koichi Fujikawa
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2010-01-…
+date: 2010-01-13 00:00:00 +09:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -32,27 +32,23 @@ extensions: []
 
 extra_rdoc_files:
 - README.rdoc
-- TODO
 files:
 - .gitignore
 - README.rdoc
 - Rakefile
-- TODO
 - VERSION
 - bin/hadoop-hudson.sh
 - bin/hadoop-ruby.sh
 - bin/hrd
 - conf/hadoop-site.xml
-- examples/apachelog-v2-2.rb
-- examples/apachelog-v2.rb
-- examples/apachelog.rb
 - examples/hive_like_test.rb
+- examples/log_analysis_test.rb
 - examples/word_count_test.rb
 - hadoop-rubydsl.gemspec
-- lib/client.rb
 - lib/core.rb
 - lib/dsl_init.rb
-- lib/hadoop-dsl.rb
+- lib/hadoop_dsl.rb
+- lib/hadoop_dsl_client.rb
 - lib/hive_like.rb
 - lib/log_analysis.rb
 - lib/mapred_factory.rb
@@ -97,8 +93,6 @@ test_files:
 - spec/hive_like_spec.rb
 - spec/log_analysis_spec.rb
 - spec/example_spec.rb
-- examples/apachelog-v2.rb
 - examples/hive_like_test.rb
+- examples/log_analysis_test.rb
 - examples/word_count_test.rb
-- examples/apachelog-v2-2.rb
-- examples/apachelog.rb
data/TODO
DELETED
data/examples/apachelog-v2.rb
DELETED
@@ -1,25 +0,0 @@
-use 'LogAnalysis'
-
-data 'apache log on test1' do
-  from 'apachlog/inputs'
-  to 'apachlog/outputs'
-
-  each_line do
-    pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*)/
-    column_name 'remote_host', 'pass', 'user', 'access_date', 'request', 'status', 'bytes' # label each column
-
-    topic 'which users?', :label => 'user' do
-      count_uniq column[:user]
-    end
-
-    # topic 'access date by monthly' do
-    #   select_date column[:access_date], BY_MONTHLY
-    #   count column[:access_date]
-    # end
-    #
-    # topic 'total bytes' do
-    #   select_date column[:access_date], BY_MONTHLY
-    #   sum column[:bytes].to_kilobytes # / 1024
-    # end
-  end
-end
data/examples/apachelog.rb
DELETED
@@ -1,15 +0,0 @@
-# Apache log analysis
-#
-# example target data:
-# 127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326
-# 127.0.0.1 - frank2 [10/Oct/2000:13:55:36 -0700] "GET /apache_pb2.gif HTTP/1.0" 200 2326
-# 127.0.0.1 - frank2 [10/Oct/2000:13:55:36 -0700] "GET /apache_pb3.gif HTTP/1.0" 404 2326
-
-use 'LogAnalysis'
-
-data.pattern /(.*) (.*) (.*) (\[.*\]) (".*") (\d*) (\d*)/
-column[2].count_uniq
-column[3].count_uniq
-column[4].count_uniq
-column[5].count_uniq
-column[6].sum