hadoop-rubydsl 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/bin/hrd +1 -1
- data/examples/hive_like_test.rb +1 -1
- data/examples/{apachelog-v2-2.rb → log_analysis_test.rb} +24 -1
- data/examples/word_count_test.rb +1 -1
- data/hadoop-rubydsl.gemspec +8 -14
- data/lib/core.rb +46 -49
- data/lib/dsl_init.rb +1 -10
- data/lib/hadoop_dsl.rb +14 -0
- data/lib/{client.rb → hadoop_dsl_client.rb} +10 -0
- data/lib/hive_like.rb +24 -40
- data/lib/log_analysis.rb +109 -69
- data/lib/mapred_factory.rb +5 -5
- data/lib/util.rb +3 -3
- data/lib/word_count.rb +22 -42
- data/spec/client_spec.rb +2 -1
- data/spec/core_spec.rb +4 -4
- data/spec/dsl_init_spec.rb +1 -1
- data/spec/example_spec.rb +7 -5
- data/spec/hive_like_spec.rb +1 -1
- data/spec/log_analysis_spec.rb +60 -3
- data/spec/mapred_factory_spec.rb +30 -10
- data/spec/spec_helper.rb +0 -1
- data/spec/util_spec.rb +3 -4
- data/spec/word_count_spec.rb +1 -1
- metadata +6 -12
- data/TODO +0 -2
- data/examples/apachelog-v2.rb +0 -25
- data/examples/apachelog.rb +0 -15
- data/lib/hadoop-dsl.rb +0 -12
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.3
|
1
|
+
0.0.4
|
data/bin/hrd
CHANGED
data/examples/hive_like_test.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
dsl 'LogAnalysis'
|
2
2
|
|
3
3
|
data 'apache log on test2' do
|
4
4
|
from 'apachelog/inputs'
|
@@ -14,5 +14,28 @@ data 'apache log on test2' do
|
|
14
14
|
topic 'ua counts', :label => 'ua' do
|
15
15
|
count_uniq column[:ua]
|
16
16
|
end
|
17
|
+
|
18
|
+
topic 'count bot', :label => 'bot' do
|
19
|
+
ua = column[:ua].value
|
20
|
+
bot = ua if ua =~ /bot/i
|
21
|
+
count_uniq bot
|
22
|
+
end
|
23
|
+
|
24
|
+
topic 'ua counts group by path' do
|
25
|
+
request = column[:request].value
|
26
|
+
path = request.split(/\s+/)[1]
|
27
|
+
group_by path
|
28
|
+
count_uniq column[:ua]
|
29
|
+
end
|
30
|
+
|
31
|
+
topic 'ua counts by daily' do
|
32
|
+
group_date_by column[:access_date], :daily
|
33
|
+
count_uniq column[:ua]
|
34
|
+
end
|
35
|
+
|
36
|
+
# topic 'total bytes' do
|
37
|
+
# select_date column[:access_date], BY_MONTHLY
|
38
|
+
# sum column[:bytes].to_kilobytes # / 1024
|
39
|
+
# end
|
17
40
|
end
|
18
41
|
end
|
data/examples/word_count_test.rb
CHANGED
data/hadoop-rubydsl.gemspec
CHANGED
@@ -5,38 +5,34 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{hadoop-rubydsl}
|
8
|
-
s.version = "0.0.3"
|
8
|
+
s.version = "0.0.4"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Koichi Fujikawa"]
|
12
|
-
s.date = %q{2010-01-
|
12
|
+
s.date = %q{2010-01-13}
|
13
13
|
s.description = %q{Hadoop Ruby DSL}
|
14
14
|
s.email = %q{fujibee@gmail.com}
|
15
15
|
s.executables = ["hrd", "hadoop-hudson.sh", "hadoop-ruby.sh"]
|
16
16
|
s.extra_rdoc_files = [
|
17
|
-
"README.rdoc"
|
18
|
-
"TODO"
|
17
|
+
"README.rdoc"
|
19
18
|
]
|
20
19
|
s.files = [
|
21
20
|
".gitignore",
|
22
21
|
"README.rdoc",
|
23
22
|
"Rakefile",
|
24
|
-
"TODO",
|
25
23
|
"VERSION",
|
26
24
|
"bin/hadoop-hudson.sh",
|
27
25
|
"bin/hadoop-ruby.sh",
|
28
26
|
"bin/hrd",
|
29
27
|
"conf/hadoop-site.xml",
|
30
|
-
"examples/apachelog-v2-2.rb",
|
31
|
-
"examples/apachelog-v2.rb",
|
32
|
-
"examples/apachelog.rb",
|
33
28
|
"examples/hive_like_test.rb",
|
29
|
+
"examples/log_analysis_test.rb",
|
34
30
|
"examples/word_count_test.rb",
|
35
31
|
"hadoop-rubydsl.gemspec",
|
36
|
-
"lib/client.rb",
|
37
32
|
"lib/core.rb",
|
38
33
|
"lib/dsl_init.rb",
|
39
|
-
"lib/hadoop-dsl.rb",
|
34
|
+
"lib/hadoop_dsl.rb",
|
35
|
+
"lib/hadoop_dsl_client.rb",
|
40
36
|
"lib/hive_like.rb",
|
41
37
|
"lib/log_analysis.rb",
|
42
38
|
"lib/mapred_factory.rb",
|
@@ -59,11 +55,9 @@ Gem::Specification.new do |s|
|
|
59
55
|
"spec/hive_like_spec.rb",
|
60
56
|
"spec/log_analysis_spec.rb",
|
61
57
|
"spec/example_spec.rb",
|
62
|
-
"examples/apachelog-v2.rb",
|
63
58
|
"examples/hive_like_test.rb",
|
64
|
-
"examples/
|
65
|
-
"examples/
|
66
|
-
"examples/apachelog.rb"
|
59
|
+
"examples/log_analysis_test.rb",
|
60
|
+
"examples/word_count_test.rb"
|
67
61
|
]
|
68
62
|
|
69
63
|
if s.respond_to? :specification_version then
|
data/lib/core.rb
CHANGED
@@ -1,10 +1,33 @@
|
|
1
|
-
require '
|
1
|
+
require 'hadoop_dsl'
|
2
2
|
require 'forwardable'
|
3
3
|
|
4
4
|
module HadoopDsl
|
5
|
+
# common
|
6
|
+
module DslElement
|
7
|
+
# all DSL statements without def is processed here
|
8
|
+
def method_missing(method_name, *args)
|
9
|
+
yield if block_given?
|
10
|
+
self
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
5
14
|
# controller
|
15
|
+
module DslController
|
16
|
+
include DslElement
|
17
|
+
|
18
|
+
def run
|
19
|
+
body = pre_process(HadoopDsl.read_file(@script))
|
20
|
+
eval(body, binding, @script)
|
21
|
+
end
|
22
|
+
|
23
|
+
def pre_process(body)
|
24
|
+
body # do nothing
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
6
28
|
class BaseMapRed
|
7
29
|
extend Forwardable
|
30
|
+
include DslController
|
8
31
|
|
9
32
|
attr_reader :emitted
|
10
33
|
|
@@ -14,66 +37,54 @@ module HadoopDsl
|
|
14
37
|
@emitted = []
|
15
38
|
end
|
16
39
|
|
17
|
-
def run
|
18
|
-
body = pre_process(read_file(@script))
|
19
|
-
eval(body, binding, @script)
|
20
|
-
end
|
21
|
-
|
22
|
-
def pre_process(body)
|
23
|
-
body # do nothing
|
24
|
-
end
|
25
|
-
|
26
40
|
def emit(hash) @emitted << hash end
|
27
41
|
|
28
|
-
|
29
|
-
def
|
42
|
+
private
|
43
|
+
def key; @model.key end
|
30
44
|
end
|
31
45
|
|
32
46
|
class BaseSetup
|
47
|
+
include DslController
|
48
|
+
|
33
49
|
def initialize(script, conf)
|
34
50
|
@script, @conf = script, conf
|
35
51
|
output_format
|
36
52
|
end
|
37
53
|
|
38
|
-
def
|
39
|
-
body = pre_process(read_file(@script))
|
40
|
-
eval(body, binding, @script)
|
41
|
-
end
|
42
|
-
|
43
|
-
def pre_process(body)
|
44
|
-
body # do nothing
|
45
|
-
end
|
46
|
-
|
47
|
-
# do nothing
|
48
|
-
def output_format; end
|
49
|
-
|
54
|
+
def output_format; end # do nothing
|
50
55
|
def paths; [@from, @to] end
|
51
|
-
|
52
56
|
def from(path) @from = path end
|
53
57
|
def to(path) @to = path end
|
54
|
-
|
55
|
-
# all DSL statements without def is processed here
|
56
|
-
def method_missing(method_name, *args) self end
|
57
58
|
end
|
58
59
|
|
59
60
|
class BaseMapper < BaseMapRed
|
60
|
-
|
61
|
-
|
61
|
+
# common functions
|
62
|
+
def identity
|
63
|
+
emit(@model.key => @model.value)
|
62
64
|
end
|
65
|
+
|
66
|
+
private
|
67
|
+
def value; @model.values end
|
63
68
|
end
|
64
69
|
|
65
70
|
class BaseReducer < BaseMapRed
|
66
|
-
|
67
|
-
|
71
|
+
# common functions
|
72
|
+
def aggregate
|
73
|
+
emit(@model.key => @model.values.inject {|ret, i| ret + i})
|
74
|
+
end
|
75
|
+
|
76
|
+
def identity
|
77
|
+
@model.values.each {|v| emit(@model.key => v)}
|
68
78
|
end
|
79
|
+
|
80
|
+
private
|
81
|
+
def values; @model.values end
|
69
82
|
end
|
70
83
|
|
71
84
|
# model
|
72
85
|
class BaseModel
|
86
|
+
include DslElement
|
73
87
|
attr_accessor :controller
|
74
|
-
|
75
|
-
# all DSL statements without def is processed here
|
76
|
-
def method_missing(method_name, *args) self end
|
77
88
|
end
|
78
89
|
|
79
90
|
class BaseMapperModel < BaseModel
|
@@ -82,11 +93,6 @@ module HadoopDsl
|
|
82
93
|
def initialize(key, value)
|
83
94
|
@key, @value = key, value
|
84
95
|
end
|
85
|
-
|
86
|
-
# common functions
|
87
|
-
def identity
|
88
|
-
@controller.emit(@key => @value)
|
89
|
-
end
|
90
96
|
end
|
91
97
|
|
92
98
|
class BaseReducerModel < BaseModel
|
@@ -95,14 +101,5 @@ module HadoopDsl
|
|
95
101
|
def initialize(key, values)
|
96
102
|
@key, @values = key, values
|
97
103
|
end
|
98
|
-
|
99
|
-
# common functions
|
100
|
-
def aggregate
|
101
|
-
@controller.emit(@key => @values.inject {|ret, i| ret + i})
|
102
|
-
end
|
103
|
-
|
104
|
-
def identity
|
105
|
-
@values.each {|v| @controller.emit(@key => v)}
|
106
|
-
end
|
107
104
|
end
|
108
105
|
end
|
data/lib/dsl_init.rb
CHANGED
@@ -1,16 +1,7 @@
|
|
1
|
-
require '
|
2
|
-
require 'java'
|
3
|
-
require 'mapred_factory'
|
4
|
-
|
5
|
-
import 'org.apache.hadoop.io.IntWritable'
|
6
|
-
import 'org.apache.hadoop.io.Text'
|
1
|
+
require 'hadoop_dsl'
|
7
2
|
|
8
3
|
include HadoopDsl
|
9
4
|
|
10
|
-
# Hadoop IO types
|
11
|
-
HadoopDsl::Text = Text
|
12
|
-
HadoopDsl::IntWritable = IntWritable
|
13
|
-
|
14
5
|
def map(key, value, output, reporter, script)
|
15
6
|
mapper = MapperFactory.create(script, key, value)
|
16
7
|
mapper.run
|
data/lib/hadoop_dsl.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'util'
|
2
|
+
require 'mapred_factory'
|
3
|
+
require 'core'
|
4
|
+
|
5
|
+
# for jruby
|
6
|
+
if defined? JRUBY_VERSION
|
7
|
+
require 'java'
|
8
|
+
import 'org.apache.hadoop.io.IntWritable'
|
9
|
+
import 'org.apache.hadoop.io.Text'
|
10
|
+
|
11
|
+
# Hadoop IO types
|
12
|
+
HadoopDsl::Text = Text
|
13
|
+
HadoopDsl::IntWritable = IntWritable
|
14
|
+
end
|
@@ -1,4 +1,14 @@
|
|
1
|
+
require 'jruby-on-hadoop'
|
2
|
+
|
1
3
|
module HadoopDsl
|
4
|
+
def self.lib_path
|
5
|
+
File.expand_path(File.dirname(__FILE__))
|
6
|
+
end
|
7
|
+
|
8
|
+
def self.dsl_init_script
|
9
|
+
File.join(lib_path, "dsl_init.rb")
|
10
|
+
end
|
11
|
+
|
2
12
|
class Client < JRubyOnHadoop::Client
|
3
13
|
def parse_args
|
4
14
|
super
|
data/lib/hive_like.rb
CHANGED
@@ -1,11 +1,6 @@
|
|
1
|
-
require '
|
2
|
-
require 'enumerator'
|
1
|
+
require 'hadoop_dsl'
|
3
2
|
|
4
3
|
module HadoopDsl::HiveLike
|
5
|
-
include HadoopDsl
|
6
|
-
|
7
|
-
AVAILABLE_METHODS = [:select, :create_table, :table]
|
8
|
-
|
9
4
|
# common
|
10
5
|
module HiveLikeMapRed
|
11
6
|
def pre_process(body)
|
@@ -17,7 +12,7 @@ module HadoopDsl::HiveLike
|
|
17
12
|
args = sprit_and_marge_args($2)
|
18
13
|
processed << "#{method}(#{args})\n"
|
19
14
|
else
|
20
|
-
processed << line + "\n"
|
15
|
+
processed << line + "\n" if line
|
21
16
|
end
|
22
17
|
end
|
23
18
|
processed
|
@@ -32,15 +27,15 @@ module HadoopDsl::HiveLike
|
|
32
27
|
end
|
33
28
|
|
34
29
|
# controller
|
35
|
-
class HiveLikeSetup < BaseSetup
|
30
|
+
class HiveLikeSetup < HadoopDsl::BaseSetup
|
36
31
|
def load_data(inputs, table)
|
37
32
|
@from = inputs
|
38
33
|
@to = inputs.gsub(/#{File.basename(inputs)}$/, 'outputs')
|
39
34
|
end
|
40
35
|
|
41
36
|
def output_format
|
42
|
-
@conf.output_key_class = Text
|
43
|
-
@conf.output_value_class = Text
|
37
|
+
@conf.output_key_class = HadoopDsl::Text
|
38
|
+
@conf.output_value_class = HadoopDsl::Text
|
44
39
|
end
|
45
40
|
|
46
41
|
# might not need but occur error if not exists
|
@@ -49,37 +44,43 @@ module HadoopDsl::HiveLike
|
|
49
44
|
include HiveLikeMapRed
|
50
45
|
end
|
51
46
|
|
52
|
-
class HiveLikeMapper < BaseMapper
|
47
|
+
class HiveLikeMapper < HadoopDsl::BaseMapper
|
53
48
|
def initialize(script, key, value)
|
54
49
|
super(script, HiveLikeMapperModel.new(key, value))
|
55
50
|
end
|
56
51
|
|
57
52
|
include HiveLikeMapRed
|
58
53
|
|
59
|
-
|
60
|
-
|
54
|
+
def_delegators :@model, :create_table, :table
|
55
|
+
|
56
|
+
# emitters
|
57
|
+
def select(*args)
|
58
|
+
from_index = args.index('from')
|
59
|
+
if from_index
|
60
|
+
values = args[0...from_index].map do |column|
|
61
|
+
splitted = @model.value.split(/[,\s]+/)
|
62
|
+
splitted[@model.table.columns.index(column)]
|
63
|
+
end
|
64
|
+
emit(args[from_index + 1] => values.join(", "))
|
65
|
+
end
|
66
|
+
end
|
61
67
|
end
|
62
68
|
|
63
|
-
class HiveLikeReducer < BaseReducer
|
69
|
+
class HiveLikeReducer < HadoopDsl::BaseReducer
|
64
70
|
def initialize(script, key, values)
|
65
71
|
super(script, HiveLikeReducerModel.new(key, values))
|
66
72
|
end
|
67
73
|
|
68
74
|
include HiveLikeMapRed
|
69
75
|
|
70
|
-
#
|
71
|
-
|
76
|
+
# emitters
|
77
|
+
def select(*args) identity end
|
72
78
|
end
|
73
79
|
|
74
80
|
# model
|
75
|
-
class HiveLikeMapperModel < BaseMapperModel
|
81
|
+
class HiveLikeMapperModel < HadoopDsl::BaseMapperModel
|
76
82
|
attr_reader :table
|
77
83
|
|
78
|
-
def initialize(key, value)
|
79
|
-
super(key, value)
|
80
|
-
end
|
81
|
-
|
82
|
-
# emitters
|
83
84
|
def create_table(name, *column_and_type)
|
84
85
|
@table = Table.new(name)
|
85
86
|
column_and_type.each_with_index do |column, index|
|
@@ -88,17 +89,6 @@ module HadoopDsl::HiveLike
|
|
88
89
|
end
|
89
90
|
end
|
90
91
|
|
91
|
-
def select(*args)
|
92
|
-
from_index = args.index('from')
|
93
|
-
if from_index
|
94
|
-
values = args[0...from_index].map do |column|
|
95
|
-
splitted = @value.split(/[,\s]+/)
|
96
|
-
splitted[@table.columns.index(column)]
|
97
|
-
end
|
98
|
-
@controller.emit(args[from_index + 1] => values.join(", "))
|
99
|
-
end
|
100
|
-
end
|
101
|
-
|
102
92
|
class Table
|
103
93
|
attr_reader :name, :columns
|
104
94
|
|
@@ -111,12 +101,6 @@ module HadoopDsl::HiveLike
|
|
111
101
|
end
|
112
102
|
end
|
113
103
|
|
114
|
-
class HiveLikeReducerModel < BaseReducerModel
|
115
|
-
def initialize(key, values)
|
116
|
-
super(key, values)
|
117
|
-
end
|
118
|
-
|
119
|
-
# emitters
|
120
|
-
def select(*args) identity end
|
104
|
+
class HiveLikeReducerModel < HadoopDsl::BaseReducerModel
|
121
105
|
end
|
122
106
|
end
|
data/lib/log_analysis.rb
CHANGED
@@ -1,55 +1,123 @@
|
|
1
|
-
require '
|
1
|
+
require 'hadoop_dsl'
|
2
2
|
require 'enumerator'
|
3
3
|
|
4
4
|
module HadoopDsl::LogAnalysis
|
5
|
-
include HadoopDsl
|
6
|
-
|
7
5
|
KEY_SEP = "\t"
|
8
6
|
PREFIX = 'col'
|
9
7
|
PASS = nil
|
10
|
-
|
8
|
+
MODEL_METHODS = [:column, :value]
|
11
9
|
|
12
|
-
#
|
13
|
-
|
14
|
-
|
15
|
-
|
10
|
+
# controller
|
11
|
+
class LogAnalysisMapper < HadoopDsl::BaseMapper
|
12
|
+
def initialize(script, key, value)
|
13
|
+
super(script, LogAnalysisMapperModel.new(key, value))
|
14
|
+
end
|
16
15
|
|
17
|
-
|
18
|
-
|
16
|
+
# model methods
|
17
|
+
def_delegators :@model, *MODEL_METHODS
|
18
|
+
|
19
|
+
def topic(desc, options = {}, &block)
|
20
|
+
@model.create_topic(desc, options)
|
21
|
+
yield if block_given?
|
22
|
+
current_topic
|
23
|
+
end
|
19
24
|
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
super(script, conf)
|
25
|
+
def separate(sep)
|
26
|
+
parts = value.split(sep)
|
27
|
+
@model.create_or_replace_columns_with(parts) {|column, value| column.value = value}
|
24
28
|
end
|
25
29
|
|
26
|
-
|
27
|
-
|
30
|
+
def pattern(re)
|
31
|
+
if value =~ re
|
32
|
+
md = Regexp.last_match
|
33
|
+
@model.create_or_replace_columns_with(md.captures) {|column, value| column.value = value}
|
34
|
+
end
|
35
|
+
end
|
28
36
|
|
29
|
-
|
30
|
-
def
|
31
|
-
|
37
|
+
# column names by String converted to Symbol
|
38
|
+
def column_name(*names)
|
39
|
+
sym_names = names.map {|name| name.is_a?(String) ? name.to_sym : name }
|
40
|
+
@model.create_or_replace_columns_with(sym_names) {|column, name| column.name = name}
|
32
41
|
end
|
33
42
|
|
34
|
-
|
43
|
+
def group_by(column_or_value)
|
44
|
+
case column_or_value
|
45
|
+
when LogAnalysisMapperModel::Column
|
46
|
+
column = column_or_value
|
47
|
+
current_topic.key_elements << column.value
|
48
|
+
else
|
49
|
+
value = column_or_value
|
50
|
+
current_topic.key_elements << value
|
51
|
+
end
|
52
|
+
end
|
35
53
|
|
36
|
-
|
37
|
-
|
54
|
+
def group_date_by(column, term)
|
55
|
+
require 'time'
|
56
|
+
time = parse_time(column.value)
|
57
|
+
time_key = case term
|
58
|
+
when :daily then time.strftime('%Y%m%d')
|
59
|
+
when :monthly then time.strftime('%Y%m')
|
60
|
+
when :yearly then time.strftime('%Y')
|
61
|
+
end
|
62
|
+
current_topic.key_elements << time_key
|
63
|
+
end
|
64
|
+
|
65
|
+
# emitters
|
66
|
+
def count_uniq(column_or_value)
|
67
|
+
uniq_key =
|
68
|
+
case column_or_value
|
69
|
+
when LogAnalysisMapperModel::Column
|
70
|
+
column = column_or_value
|
71
|
+
column.value
|
72
|
+
else column_or_value # value
|
73
|
+
end
|
74
|
+
current_topic.key_elements << uniq_key
|
75
|
+
emit(current_topic.key => 1)
|
76
|
+
end
|
77
|
+
|
78
|
+
def sum(column)
|
79
|
+
emit(current_topic.key => column.value.to_i)
|
80
|
+
end
|
81
|
+
|
82
|
+
private
|
83
|
+
def current_topic; @model.current_topic end
|
84
|
+
|
85
|
+
def parse_time(str)
|
86
|
+
begin Time.parse(str)
|
87
|
+
rescue
|
88
|
+
# apachelog pattern ex) "10/Oct/2000:13:55:36 -0700"
|
89
|
+
Time.parse($1) if str =~ /^(\d*\/\w*\/\d*):/
|
90
|
+
end
|
91
|
+
end
|
38
92
|
end
|
39
93
|
|
40
|
-
class LogAnalysisReducer < BaseReducer
|
94
|
+
class LogAnalysisReducer < HadoopDsl::BaseReducer
|
41
95
|
def initialize(script, key, values)
|
42
96
|
super(script, LogAnalysisReducerModel.new(key, values))
|
43
97
|
end
|
44
98
|
|
45
|
-
include LogAnalysisMapRed
|
46
|
-
|
47
99
|
# model methods
|
48
|
-
def_delegators :@model, *
|
100
|
+
def_delegators :@model, *MODEL_METHODS
|
101
|
+
|
102
|
+
def topic(desc, options = {}, &block)
|
103
|
+
@model.create_topic(desc, options)
|
104
|
+
yield if block_given?
|
105
|
+
@model.current_topic
|
106
|
+
end
|
107
|
+
|
108
|
+
def count_uniq(column)
|
109
|
+
aggregate if @model.topic == @model.current_topic
|
110
|
+
end
|
111
|
+
|
112
|
+
def sum(column)
|
113
|
+
aggregate if @model.topic == @model.current_topic
|
114
|
+
end
|
49
115
|
end
|
50
116
|
|
51
117
|
# model
|
52
|
-
class LogAnalysisMapperModel < BaseMapperModel
|
118
|
+
class LogAnalysisMapperModel < HadoopDsl::BaseMapperModel
|
119
|
+
attr_reader :current_topic
|
120
|
+
|
53
121
|
def initialize(key, value)
|
54
122
|
super(key, value)
|
55
123
|
@columns = ColumnArray.new
|
@@ -58,28 +126,8 @@ module HadoopDsl::LogAnalysis
|
|
58
126
|
|
59
127
|
def column; @columns end
|
60
128
|
|
61
|
-
def
|
129
|
+
def create_topic(desc, options)
|
62
130
|
@topics << @current_topic = Topic.new(desc, options[:label])
|
63
|
-
yield if block_given?
|
64
|
-
@current_topic
|
65
|
-
end
|
66
|
-
|
67
|
-
def separate(sep)
|
68
|
-
parts = @value.split(sep)
|
69
|
-
create_or_replace_columns_with(parts) {|column, value| column.value = value}
|
70
|
-
end
|
71
|
-
|
72
|
-
def pattern(re)
|
73
|
-
if @value =~ re
|
74
|
-
md = Regexp.last_match
|
75
|
-
create_or_replace_columns_with(md.captures) {|column, value| column.value = value}
|
76
|
-
end
|
77
|
-
end
|
78
|
-
|
79
|
-
# column names by String converted to Symbol
|
80
|
-
def column_name(*names)
|
81
|
-
sym_names = names.map {|name| name.is_a?(String) ? name.to_sym : name }
|
82
|
-
create_or_replace_columns_with(sym_names) {|column, name| column.name = name}
|
83
131
|
end
|
84
132
|
|
85
133
|
def create_or_replace_columns_with(array, &block)
|
@@ -91,15 +139,6 @@ module HadoopDsl::LogAnalysis
|
|
91
139
|
@columns = ColumnArray.new(columns)
|
92
140
|
end
|
93
141
|
|
94
|
-
# emitters
|
95
|
-
def count_uniq(column)
|
96
|
-
@controller.emit([@current_topic.label, KEY_SEP, column.value].join => 1)
|
97
|
-
end
|
98
|
-
|
99
|
-
def sum(column)
|
100
|
-
@controller.emit([@current_topic.label].join => column.value.to_i)
|
101
|
-
end
|
102
|
-
|
103
142
|
class ColumnArray < Array
|
104
143
|
def [](key)
|
105
144
|
case key
|
@@ -120,17 +159,28 @@ module HadoopDsl::LogAnalysis
|
|
120
159
|
end
|
121
160
|
|
122
161
|
class Topic
|
162
|
+
attr_reader :key_elements
|
163
|
+
|
123
164
|
def initialize(desc, label = nil)
|
124
165
|
@desc, @label = desc, label
|
166
|
+
@key_elements = []
|
125
167
|
end
|
126
168
|
|
127
169
|
def label
|
128
170
|
@label || @desc.gsub(/\s/, '_')
|
129
171
|
end
|
172
|
+
|
173
|
+
def key
|
174
|
+
without_label =
|
175
|
+
@key_elements.size > 0 ? @key_elements.join(KEY_SEP) : nil
|
176
|
+
[label, without_label].compact.join(KEY_SEP)
|
177
|
+
end
|
130
178
|
end
|
131
179
|
end
|
132
180
|
|
133
|
-
class LogAnalysisReducerModel < BaseReducerModel
|
181
|
+
class LogAnalysisReducerModel < HadoopDsl::BaseReducerModel
|
182
|
+
attr_reader :topic, :current_topic
|
183
|
+
|
134
184
|
def initialize(key, values)
|
135
185
|
super(key, values)
|
136
186
|
if key =~ /(\w*)#{KEY_SEP}?(.*)/
|
@@ -138,18 +188,8 @@ module HadoopDsl::LogAnalysis
|
|
138
188
|
end
|
139
189
|
end
|
140
190
|
|
141
|
-
def
|
191
|
+
def create_topic(desc, options)
|
142
192
|
@current_topic = Topic.new(options[:label] || desc.gsub(/\s/, '_'), nil)
|
143
|
-
yield if block_given?
|
144
|
-
@current_topic
|
145
|
-
end
|
146
|
-
|
147
|
-
def count_uniq(column)
|
148
|
-
aggregate if @topic == @current_topic
|
149
|
-
end
|
150
|
-
|
151
|
-
def sum(column)
|
152
|
-
aggregate if @topic == @current_topic
|
153
193
|
end
|
154
194
|
|
155
195
|
class Topic
|
data/lib/mapred_factory.rb
CHANGED
@@ -1,16 +1,16 @@
|
|
1
|
-
require '
|
1
|
+
require 'hadoop_dsl'
|
2
2
|
|
3
3
|
module HadoopDsl
|
4
4
|
class MapRedFactory
|
5
5
|
def self.dsl_name(script)
|
6
|
-
read_file(script).each_line do |line|
|
7
|
-
dsl_name = $1 if line =~
|
8
|
-
return dsl_name
|
6
|
+
HadoopDsl.read_file(script).each_line do |line|
|
7
|
+
dsl_name = $1 if line =~ /\s*dsl\s*\(?["'](\w*)["']\)?/
|
8
|
+
return dsl_name if dsl_name
|
9
9
|
end
|
10
10
|
end
|
11
11
|
|
12
12
|
def self.require_dsl_lib(dsl_name)
|
13
|
-
require snake_case(dsl_name)
|
13
|
+
require HadoopDsl.snake_case(dsl_name)
|
14
14
|
end
|
15
15
|
end
|
16
16
|
|
data/lib/util.rb
CHANGED
@@ -1,18 +1,18 @@
|
|
1
1
|
# utility functions
|
2
|
+
require 'hadoop_dsl'
|
2
3
|
|
3
4
|
module HadoopDsl
|
4
|
-
def snake_case(str)
|
5
|
+
def self.snake_case(str)
|
5
6
|
str.gsub(/\B[A-Z]/, '_\&').downcase
|
6
7
|
end
|
7
8
|
|
8
|
-
def read_file(file_name)
|
9
|
+
def self.read_file(file_name)
|
9
10
|
# read as usual
|
10
11
|
body = File.open(file_name).read rescue nil
|
11
12
|
return body if body
|
12
13
|
|
13
14
|
# read from loadpath
|
14
15
|
$:.each do |path|
|
15
|
-
p path
|
16
16
|
body = File.open(File.join(path, file_name)).read rescue next
|
17
17
|
return body if body
|
18
18
|
end
|
data/lib/word_count.rb
CHANGED
@@ -1,76 +1,56 @@
|
|
1
|
-
require '
|
1
|
+
require 'hadoop_dsl'
|
2
2
|
require 'enumerator'
|
3
3
|
|
4
4
|
module HadoopDsl::WordCount
|
5
|
-
|
6
|
-
|
7
|
-
AVAILABLE_METHODS = [:count_uniq, :total]
|
5
|
+
MODEL_METHODS = []
|
8
6
|
TOTAL_PREFIX = "\t"
|
9
7
|
|
10
|
-
# common
|
11
|
-
module WordCountMapRed
|
12
|
-
# entry point
|
13
|
-
def data(description = '', &block) yield end
|
14
|
-
end
|
15
|
-
|
16
8
|
# controller
|
17
|
-
class WordCountMapper < BaseMapper
|
9
|
+
class WordCountMapper < HadoopDsl::BaseMapper
|
18
10
|
def initialize(script, key, value)
|
19
11
|
super(script, WordCountMapperModel.new(key, value))
|
20
12
|
end
|
21
13
|
|
22
|
-
include WordCountMapRed
|
23
|
-
|
24
|
-
# model methods
|
25
|
-
def_delegators :@model, *AVAILABLE_METHODS
|
26
|
-
end
|
27
|
-
|
28
|
-
class WordCountReducer < BaseReducer
|
29
|
-
def initialize(script, key, values)
|
30
|
-
super(script, WordCountReducerModel.new(key, values))
|
31
|
-
end
|
32
|
-
|
33
|
-
include WordCountMapRed
|
34
|
-
|
35
14
|
# model methods
|
36
|
-
def_delegators :@model, *AVAILABLE_METHODS
|
37
|
-
end
|
38
|
-
|
39
|
-
# model
|
40
|
-
class WordCountMapperModel < BaseMapperModel
|
41
|
-
def initialize(key, value)
|
42
|
-
super(key, value)
|
43
|
-
end
|
15
|
+
def_delegators :@model, *MODEL_METHODS
|
44
16
|
|
45
17
|
# emitters
|
46
18
|
def count_uniq
|
47
|
-
@value.split.each {|word|
|
19
|
+
@model.value.split.each {|word| emit(word => 1)}
|
48
20
|
end
|
49
21
|
|
50
22
|
def total(*types)
|
51
23
|
types.each do |type|
|
52
24
|
case type
|
53
25
|
when :bytes
|
54
|
-
|
26
|
+
emit("#{TOTAL_PREFIX}total bytes" => @model.value.gsub(/\s/, '').length)
|
55
27
|
when :words
|
56
|
-
|
28
|
+
emit("#{TOTAL_PREFIX}total words" => @model.value.split.size)
|
57
29
|
when :lines
|
58
|
-
|
30
|
+
emit("#{TOTAL_PREFIX}total lines" => 1)
|
59
31
|
end
|
60
32
|
end
|
61
33
|
end
|
62
34
|
end
|
63
35
|
|
64
|
-
class
|
65
|
-
def initialize(key, values)
|
66
|
-
super(key, values)
|
36
|
+
class WordCountReducer < HadoopDsl::BaseReducer
|
37
|
+
def initialize(script, key, values)
|
38
|
+
super(script, WordCountReducerModel.new(key, values))
|
67
39
|
end
|
68
40
|
|
41
|
+
# model methods
|
42
|
+
def_delegators :@model, *MODEL_METHODS
|
43
|
+
|
69
44
|
# emitters
|
70
|
-
def count_uniq; aggregate unless total_value? end
|
71
|
-
def total(*types); aggregate if total_value? end
|
45
|
+
def count_uniq; aggregate unless @model.total_value? end
|
46
|
+
def total(*types); aggregate if @model.total_value? end
|
47
|
+
end
|
48
|
+
|
49
|
+
# model
|
50
|
+
class WordCountMapperModel < HadoopDsl::BaseMapperModel
|
51
|
+
end
|
72
52
|
|
73
|
-
|
53
|
+
class WordCountReducerModel < HadoopDsl::BaseReducerModel
|
74
54
|
def total_value?; @key =~ /^#{TOTAL_PREFIX}/ end
|
75
55
|
end
|
76
56
|
end
|
data/spec/client_spec.rb
CHANGED
data/spec/core_spec.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
require '
|
1
|
+
require File.join(File.dirname(__FILE__), 'spec_helper')
|
2
2
|
require 'core'
|
3
3
|
|
4
4
|
include HadoopDsl
|
@@ -38,7 +38,7 @@ to 'test/outputs'
|
|
38
38
|
it 'can emit as identity' do
|
39
39
|
model = BaseMapperModel.new('key', 'value')
|
40
40
|
mapper = BaseMapper.new(@script, model)
|
41
|
-
|
41
|
+
mapper.identity
|
42
42
|
|
43
43
|
mapper.emitted.should == [{'key' => 'value'}]
|
44
44
|
end
|
@@ -48,7 +48,7 @@ to 'test/outputs'
|
|
48
48
|
it 'can emit as aggregate' do
|
49
49
|
model = BaseReducerModel.new('key', [1, 2, 3])
|
50
50
|
reducer = BaseReducer.new(@script, model)
|
51
|
-
|
51
|
+
reducer.aggregate
|
52
52
|
|
53
53
|
reducer.emitted.should == [{'key' => 6}]
|
54
54
|
end
|
@@ -56,7 +56,7 @@ to 'test/outputs'
|
|
56
56
|
it 'can emit as identity' do
|
57
57
|
model = BaseReducerModel.new('key', [1, 2, 3])
|
58
58
|
reducer = BaseReducer.new(@script, model)
|
59
|
-
|
59
|
+
reducer.identity
|
60
60
|
|
61
61
|
reducer.emitted.should == [{'key' => 1}, {'key' => 2}, {'key' => 3}]
|
62
62
|
end
|
data/spec/dsl_init_spec.rb
CHANGED
data/spec/example_spec.rb
CHANGED
@@ -1,23 +1,25 @@
|
|
1
1
|
require 'log_analysis'
|
2
2
|
require 'word_count'
|
3
|
+
require 'hive_like'
|
3
4
|
|
4
5
|
include HadoopDsl::LogAnalysis
|
5
6
|
describe 'Aapach Log Example' do
|
6
7
|
before(:all) do
|
7
|
-
@script = File.join(File.dirname(__FILE__), '..', 'examples', '
|
8
|
-
@
|
8
|
+
@script = File.join(File.dirname(__FILE__), '..', 'examples', 'log_analysis_test.rb')
|
9
|
+
@bot_ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
|
10
|
+
@value = %Q!127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326 "-" "#{@bot_ua}"!
|
9
11
|
end
|
10
12
|
|
11
13
|
it 'can run example by mapper' do
|
12
14
|
mapper = LogAnalysisMapper.new(@script, nil, @value)
|
13
15
|
mapper.run
|
14
|
-
mapper.emitted.first
|
16
|
+
mapper.emitted.first.should == {"ua\t#{@bot_ua}" => 1}
|
15
17
|
end
|
16
18
|
|
17
19
|
it 'can run example by reducer' do
|
18
|
-
reducer = LogAnalysisReducer.new(@script, "
|
20
|
+
reducer = LogAnalysisReducer.new(@script, "ua\tChrome", [1, 1, 1])
|
19
21
|
reducer.run
|
20
|
-
reducer.emitted.first["
|
22
|
+
reducer.emitted.first["ua\tChrome"].should == 3
|
21
23
|
end
|
22
24
|
end
|
23
25
|
|
data/spec/hive_like_spec.rb
CHANGED
data/spec/log_analysis_spec.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
require '
|
1
|
+
require File.join(File.dirname(__FILE__), 'spec_helper')
|
2
2
|
require 'log_analysis'
|
3
3
|
|
4
4
|
include HadoopDsl::LogAnalysis
|
@@ -39,13 +39,22 @@ describe LogAnalysisMapper do
|
|
39
39
|
mapper.column[:user].value.should == 'frank'
|
40
40
|
end
|
41
41
|
|
42
|
-
it 'should count uniq column' do
|
42
|
+
it 'should count uniq by column' do
|
43
43
|
value = 'count uniq'
|
44
44
|
mapper = LogAnalysisMapper.new(nil, nil, value)
|
45
45
|
mapper.separate(' ')
|
46
46
|
mapper.topic('t1') { mapper.count_uniq mapper.column[1] }
|
47
47
|
|
48
|
-
mapper.emitted.
|
48
|
+
mapper.emitted.should == [{"t1\tuniq" => 1}]
|
49
|
+
end
|
50
|
+
|
51
|
+
it 'should count uniq by value' do
|
52
|
+
value = 'count uniq'
|
53
|
+
mapper = LogAnalysisMapper.new(nil, nil, value)
|
54
|
+
mapper.separate(' ')
|
55
|
+
mapper.topic('t1') { mapper.count_uniq 'orig value' }
|
56
|
+
|
57
|
+
mapper.emitted.should == [{"t1\torig value" => 1}]
|
49
58
|
end
|
50
59
|
|
51
60
|
it 'should sum column value' do
|
@@ -83,6 +92,54 @@ describe LogAnalysisMapper do
|
|
83
92
|
topic = mapper.topic('desc with space')
|
84
93
|
topic.label.should == 'desc_with_space'
|
85
94
|
end
|
95
|
+
|
96
|
+
it 'can group date monthly' do
|
97
|
+
value = '2010/1/1 newyearday'
|
98
|
+
mapper = LogAnalysisMapper.new(nil, nil, value)
|
99
|
+
mapper.separate(' ')
|
100
|
+
mapper.column_name 'date', 'holiday'
|
101
|
+
|
102
|
+
['yearly', 'monthly', 'daily'].each do |term|
|
103
|
+
mapper.topic(term) do
|
104
|
+
mapper.group_date_by mapper.column[:date], term.to_sym
|
105
|
+
mapper.count_uniq mapper.column[:holiday]
|
106
|
+
end
|
107
|
+
end
|
108
|
+
mapper.emitted.should ==
|
109
|
+
[
|
110
|
+
{"yearly\t2010\tnewyearday" => 1},
|
111
|
+
{"monthly\t201001\tnewyearday" => 1},
|
112
|
+
{"daily\t20100101\tnewyearday" => 1}
|
113
|
+
]
|
114
|
+
end
|
115
|
+
|
116
|
+
it 'can group by' do
|
117
|
+
value = '1 sub_2 bingo!'
|
118
|
+
mapper = LogAnalysisMapper.new(nil, nil, value)
|
119
|
+
mapper.separate(' ')
|
120
|
+
mapper.column_name 'id', 'sub_id', 'data'
|
121
|
+
|
122
|
+
mapper.topic('test') do
|
123
|
+
mapper.group_by mapper.column[:sub_id]
|
124
|
+
mapper.count_uniq mapper.column[:data]
|
125
|
+
end
|
126
|
+
mapper.emitted.should == [{"test\tsub_2\tbingo!" => 1}]
|
127
|
+
end
|
128
|
+
end
|
129
|
+
|
130
|
+
Topic = LogAnalysisMapperModel::Topic
|
131
|
+
describe Topic do
|
132
|
+
it 'can get key with label' do
|
133
|
+
t = Topic.new('label')
|
134
|
+
t.key.should == 'label'
|
135
|
+
end
|
136
|
+
|
137
|
+
it 'can get key with label and elements' do
|
138
|
+
t = Topic.new('label')
|
139
|
+
t.key_elements << 'e1'
|
140
|
+
t.key_elements << 'e2'
|
141
|
+
t.key.should == "label\te1\te2"
|
142
|
+
end
|
86
143
|
end
|
87
144
|
|
88
145
|
describe LogAnalysisReducer do
|
data/spec/mapred_factory_spec.rb
CHANGED
@@ -1,31 +1,33 @@
|
|
1
1
|
require File.join(File.dirname(__FILE__) , 'spec_helper')
|
2
|
-
|
3
2
|
require 'mapred_factory'
|
4
|
-
require 'log_analysis'
|
5
3
|
|
6
|
-
|
4
|
+
include HadoopDsl
|
7
5
|
|
6
|
+
describe 'MapRed Factory' do
|
8
7
|
before(:all) do
|
9
|
-
@script = create_tmp_script("
|
8
|
+
@script = create_tmp_script("dsl 'LogAnalysis'")
|
10
9
|
end
|
11
10
|
|
12
11
|
it 'can create mapper' do
|
13
12
|
mapper = MapperFactory.create(@script, nil, nil)
|
14
|
-
mapper.class.should == LogAnalysisMapper
|
13
|
+
mapper.class.should == LogAnalysis::LogAnalysisMapper
|
15
14
|
end
|
16
15
|
|
17
16
|
it 'can create reducer' do
|
18
17
|
reducer = ReducerFactory.create(@script, nil, nil)
|
19
|
-
reducer.class.should == LogAnalysisReducer
|
18
|
+
reducer.class.should == LogAnalysis::LogAnalysisReducer
|
20
19
|
end
|
21
20
|
|
22
21
|
it 'can create setup' do
|
23
|
-
|
24
|
-
|
22
|
+
conf = mock('conf')
|
23
|
+
conf.should_receive(:output_key_class=).once
|
24
|
+
conf.should_receive(:output_value_class=).once
|
25
|
+
s = SetupFactory.create(create_tmp_script("dsl 'HiveLike'"), conf)
|
26
|
+
s.class.should == HiveLike::HiveLikeSetup
|
25
27
|
end
|
26
28
|
|
27
29
|
it 'can create base if not exists in specific DSL' do
|
28
|
-
s = SetupFactory.create(create_tmp_script("
|
30
|
+
s = SetupFactory.create(create_tmp_script("dsl 'WordCount'"), nil)
|
29
31
|
s.class.should == BaseSetup
|
30
32
|
end
|
31
33
|
|
@@ -37,6 +39,24 @@ describe 'MapRed Factory' do
|
|
37
39
|
it 'can convert dsl name to dsl lib file and require' do
|
38
40
|
dsl_name = MapRedFactory.dsl_name(@script)
|
39
41
|
MapRedFactory.require_dsl_lib(dsl_name).should_not be_nil
|
40
|
-
LogAnalysisMapper
|
42
|
+
LogAnalysis::LogAnalysisMapper
|
43
|
+
end
|
44
|
+
|
45
|
+
it 'can create mapper if statement has double quote' do
|
46
|
+
script = create_tmp_script(%Q!dsl "LogAnalysis"!)
|
47
|
+
mapper = MapperFactory.create(script, nil, nil)
|
48
|
+
mapper.class.should == LogAnalysis::LogAnalysisMapper
|
49
|
+
end
|
50
|
+
|
51
|
+
it 'can create mapper if exists more space' do
|
52
|
+
script = create_tmp_script(%Q! dsl "LogAnalysis" !)
|
53
|
+
mapper = MapperFactory.create(script, nil, nil)
|
54
|
+
mapper.class.should == LogAnalysis::LogAnalysisMapper
|
55
|
+
end
|
56
|
+
|
57
|
+
it 'can create mapper if exists bracket' do
|
58
|
+
script = create_tmp_script(%Q! dsl ("LogAnalysis") !)
|
59
|
+
mapper = MapperFactory.create(script, nil, nil)
|
60
|
+
mapper.class.should == LogAnalysis::LogAnalysisMapper
|
41
61
|
end
|
42
62
|
end
|
data/spec/spec_helper.rb
CHANGED
data/spec/util_spec.rb
CHANGED
@@ -1,19 +1,18 @@
|
|
1
1
|
require File.join(File.dirname(__FILE__) , 'spec_helper')
|
2
|
-
|
3
2
|
require 'util'
|
4
3
|
|
5
4
|
describe 'utilities' do
|
6
5
|
it 'can change camelcase str to snakecase' do
|
7
|
-
snake_case('CamelCaseStr').should == 'camel_case_str'
|
6
|
+
HadoopDsl.snake_case('CamelCaseStr').should == 'camel_case_str'
|
8
7
|
end
|
9
8
|
|
10
9
|
it 'can read file and get file data to string' do
|
11
10
|
script_body = 'This is a script body.'
|
12
11
|
@script = create_tmp_script(script_body)
|
13
|
-
read_file(@script).should == script_body
|
12
|
+
HadoopDsl.read_file(@script).should == script_body
|
14
13
|
end
|
15
14
|
|
16
15
|
it 'raise error if no file in loadpath' do
|
17
|
-
lambda { read_file('not_exists_on_loadpath') }.should raise_error
|
16
|
+
lambda { HadoopDsl.read_file('not_exists_on_loadpath') }.should raise_error
|
18
17
|
end
|
19
18
|
end
|
data/spec/word_count_spec.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hadoop-rubydsl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Koichi Fujikawa
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2010-01-
|
12
|
+
date: 2010-01-13 00:00:00 +09:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -32,27 +32,23 @@ extensions: []
|
|
32
32
|
|
33
33
|
extra_rdoc_files:
|
34
34
|
- README.rdoc
|
35
|
-
- TODO
|
36
35
|
files:
|
37
36
|
- .gitignore
|
38
37
|
- README.rdoc
|
39
38
|
- Rakefile
|
40
|
-
- TODO
|
41
39
|
- VERSION
|
42
40
|
- bin/hadoop-hudson.sh
|
43
41
|
- bin/hadoop-ruby.sh
|
44
42
|
- bin/hrd
|
45
43
|
- conf/hadoop-site.xml
|
46
|
-
- examples/apachelog-v2-2.rb
|
47
|
-
- examples/apachelog-v2.rb
|
48
|
-
- examples/apachelog.rb
|
49
44
|
- examples/hive_like_test.rb
|
45
|
+
- examples/log_analysis_test.rb
|
50
46
|
- examples/word_count_test.rb
|
51
47
|
- hadoop-rubydsl.gemspec
|
52
|
-
- lib/client.rb
|
53
48
|
- lib/core.rb
|
54
49
|
- lib/dsl_init.rb
|
55
|
-
- lib/hadoop-dsl.rb
|
50
|
+
- lib/hadoop_dsl.rb
|
51
|
+
- lib/hadoop_dsl_client.rb
|
56
52
|
- lib/hive_like.rb
|
57
53
|
- lib/log_analysis.rb
|
58
54
|
- lib/mapred_factory.rb
|
@@ -97,8 +93,6 @@ test_files:
|
|
97
93
|
- spec/hive_like_spec.rb
|
98
94
|
- spec/log_analysis_spec.rb
|
99
95
|
- spec/example_spec.rb
|
100
|
-
- examples/apachelog-v2.rb
|
101
96
|
- examples/hive_like_test.rb
|
97
|
+
- examples/log_analysis_test.rb
|
102
98
|
- examples/word_count_test.rb
|
103
|
-
- examples/apachelog-v2-2.rb
|
104
|
-
- examples/apachelog.rb
|
data/TODO
DELETED
data/examples/apachelog-v2.rb
DELETED
@@ -1,25 +0,0 @@
|
|
1
|
-
use 'LogAnalysis'
|
2
|
-
|
3
|
-
data 'apache log on test1' do
|
4
|
-
from 'apachlog/inputs'
|
5
|
-
to 'apachlog/outputs'
|
6
|
-
|
7
|
-
each_line do
|
8
|
-
pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*)/
|
9
|
-
column_name 'remote_host', 'pass', 'user', 'access_date', 'request', 'status', 'bytes' # 各カラムにラベルをつける
|
10
|
-
|
11
|
-
topic 'which users?', :label => 'user' do
|
12
|
-
count_uniq column[:user]
|
13
|
-
end
|
14
|
-
|
15
|
-
# topic 'access date by monthly' do
|
16
|
-
# select_date column[:access_date], BY_MONTHLY
|
17
|
-
# count column[:access_date]
|
18
|
-
# end
|
19
|
-
#
|
20
|
-
# topic 'total bytes' do
|
21
|
-
# select_date column[:access_date], BY_MONTHLY
|
22
|
-
# sum column[:bytes].to_kilobytes # / 1024
|
23
|
-
# end
|
24
|
-
end
|
25
|
-
end
|
data/examples/apachelog.rb
DELETED
@@ -1,15 +0,0 @@
|
|
1
|
-
# Apache log analysis
|
2
|
-
#
|
3
|
-
# example target data:
|
4
|
-
# 127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326
|
5
|
-
# 127.0.0.1 - frank2 [10/Oct/2000:13:55:36 -0700] "GET /apache_pb2.gif HTTP/1.0" 200 2326
|
6
|
-
# 127.0.0.1 - frank2 [10/Oct/2000:13:55:36 -0700] "GET /apache_pb3.gif HTTP/1.0" 404 2326
|
7
|
-
|
8
|
-
use 'LogAnalysis'
|
9
|
-
|
10
|
-
data.pattern /(.*) (.*) (.*) (\[.*\]) (".*") (\d*) (\d*)/
|
11
|
-
column[2].count_uniq
|
12
|
-
column[3].count_uniq
|
13
|
-
column[4].count_uniq
|
14
|
-
column[5].count_uniq
|
15
|
-
column[6].sum
|