hadoop-rubydsl 0.0.3 → 0.0.4

data/VERSION CHANGED
@@ -1 +1 @@
- 0.0.3
+ 0.0.4
data/bin/hrd CHANGED
@@ -1,5 +1,5 @@
  #!/usr/bin/env ruby

- require 'hadoop-dsl'
+ require 'hadoop_dsl_client'

  HadoopDsl::Client.new(ARGV).run
@@ -1,4 +1,4 @@
- use 'HiveLike'
+ dsl 'HiveLike'

  # hive-like/items.txt
  # apple, 3, 100
@@ -1,4 +1,4 @@
- use 'LogAnalysis'
+ dsl 'LogAnalysis'

  data 'apache log on test2' do
  from 'apachelog/inputs'
@@ -14,5 +14,28 @@ data 'apache log on test2' do
  topic 'ua counts', :label => 'ua' do
  count_uniq column[:ua]
  end
+
+ topic 'count bot', :label => 'bot' do
+ ua = column[:ua].value
+ bot = ua if ua =~ /bot/i
+ count_uniq bot
+ end
+
+ topic 'ua counts group by path' do
+ request = column[:request].value
+ path = request.split(/\s+/)[1]
+ group_by path
+ count_uniq column[:ua]
+ end
+
+ topic 'ua counts by daily' do
+ group_date_by column[:access_date], :daily
+ count_uniq column[:ua]
+ end
+
+ # topic 'total bytes' do
+ # select_date column[:access_date], BY_MONTHLY
+ # sum column[:bytes].to_kilobytes # / 1024
+ # end
  end
  end
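(This example file appears to be examples/log_analysis_test.rb, per the gemspec file list below.) Each topic composes its map key from the topic label, or the description with spaces turned into underscores, plus any group_by / group_date_by elements, joined by tabs; the lib/log_analysis.rb hunks further down implement this. A minimal sketch of driving the mapper directly, in the style of the specs at the end of this diff; the pattern and column names here are assumptions, since the example's header lines are not part of this hunk:

    require 'log_analysis'
    include HadoopDsl::LogAnalysis

    line = '127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] ' +
           '"GET /a.gif HTTP/1.0" 200 2326 "-" "SomeBot/1.0"'
    mapper = LogAnalysisMapper.new(nil, nil, line)
    # assumed pattern and column names; the real example file defines its own
    mapper.pattern(/(.*) (.*) (.*) \[(.*)\] "(.*)" (\d*) (\d*) "(.*)" "(.*)"/)
    mapper.column_name 'remote_host', 'pass', 'user', 'access_date',
                       'request', 'status', 'bytes', 'referer', 'ua'
    mapper.topic('ua counts', :label => 'ua') do
      mapper.count_uniq mapper.column[:ua]
    end
    mapper.emitted  # => [{"ua\tSomeBot/1.0" => 1}]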
@@ -1,4 +1,4 @@
- use 'WordCount'
+ dsl 'WordCount'

  from 'wc/inputs'
  to 'wc/outputs'
@@ -5,38 +5,34 @@

  Gem::Specification.new do |s|
  s.name = %q{hadoop-rubydsl}
- s.version = "0.0.3"
+ s.version = "0.0.4"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Koichi Fujikawa"]
- s.date = %q{2010-01-04}
+ s.date = %q{2010-01-13}
  s.description = %q{Hadoop Ruby DSL}
  s.email = %q{fujibee@gmail.com}
  s.executables = ["hrd", "hadoop-hudson.sh", "hadoop-ruby.sh"]
  s.extra_rdoc_files = [
- "README.rdoc",
- "TODO"
+ "README.rdoc"
  ]
  s.files = [
  ".gitignore",
  "README.rdoc",
  "Rakefile",
- "TODO",
  "VERSION",
  "bin/hadoop-hudson.sh",
  "bin/hadoop-ruby.sh",
  "bin/hrd",
  "conf/hadoop-site.xml",
- "examples/apachelog-v2-2.rb",
- "examples/apachelog-v2.rb",
- "examples/apachelog.rb",
  "examples/hive_like_test.rb",
+ "examples/log_analysis_test.rb",
  "examples/word_count_test.rb",
  "hadoop-rubydsl.gemspec",
- "lib/client.rb",
  "lib/core.rb",
  "lib/dsl_init.rb",
- "lib/hadoop-dsl.rb",
+ "lib/hadoop_dsl.rb",
+ "lib/hadoop_dsl_client.rb",
  "lib/hive_like.rb",
  "lib/log_analysis.rb",
  "lib/mapred_factory.rb",
@@ -59,11 +55,9 @@ Gem::Specification.new do |s|
  "spec/hive_like_spec.rb",
  "spec/log_analysis_spec.rb",
  "spec/example_spec.rb",
- "examples/apachelog-v2.rb",
  "examples/hive_like_test.rb",
- "examples/word_count_test.rb",
- "examples/apachelog-v2-2.rb",
- "examples/apachelog.rb"
+ "examples/log_analysis_test.rb",
+ "examples/word_count_test.rb"
  ]

  if s.respond_to? :specification_version then
@@ -1,10 +1,33 @@
- require 'util'
+ require 'hadoop_dsl'
  require 'forwardable'

  module HadoopDsl
+ # common
+ module DslElement
+ # all DSL statements without def is processed here
+ def method_missing(method_name, *args)
+ yield if block_given?
+ self
+ end
+ end
+
  # controller
+ module DslController
+ include DslElement
+
+ def run
+ body = pre_process(HadoopDsl.read_file(@script))
+ eval(body, binding, @script)
+ end
+
+ def pre_process(body)
+ body # do nothing
+ end
+ end
+
  class BaseMapRed
  extend Forwardable
+ include DslController

  attr_reader :emitted

@@ -14,66 +37,54 @@ module HadoopDsl
  @emitted = []
  end

- def run
- body = pre_process(read_file(@script))
- eval(body, binding, @script)
- end
-
- def pre_process(body)
- body # do nothing
- end
-
  def emit(hash) @emitted << hash end

- # all DSL statements without def is processed here
- def method_missing(method_name, *args) self end
+ private
+ def key; @model.key end
  end

  class BaseSetup
+ include DslController
+
  def initialize(script, conf)
  @script, @conf = script, conf
  output_format
  end

- def run
- body = pre_process(read_file(@script))
- eval(body, binding, @script)
- end
-
- def pre_process(body)
- body # do nothing
- end
-
- # do nothing
- def output_format; end
-
+ def output_format; end # do nothing
  def paths; [@from, @to] end
-
  def from(path) @from = path end
  def to(path) @to = path end
-
- # all DSL statements without def is processed here
- def method_missing(method_name, *args) self end
  end

  class BaseMapper < BaseMapRed
- def initialize(script, model)
- super(script, model)
+ # common functions
+ def identity
+ emit(@model.key => @model.value)
  end
+
+ private
+ def value; @model.values end
  end

  class BaseReducer < BaseMapRed
- def initialize(script, model)
- super(script, model)
+ # common functions
+ def aggregate
+ emit(@model.key => @model.values.inject {|ret, i| ret + i})
+ end
+
+ def identity
+ @model.values.each {|v| emit(@model.key => v)}
  end
+
+ private
+ def values; @model.values end
  end

  # model
  class BaseModel
+ include DslElement
  attr_accessor :controller
-
- # all DSL statements without def is processed here
- def method_missing(method_name, *args) self end
  end

  class BaseMapperModel < BaseModel
@@ -82,11 +93,6 @@ module HadoopDsl
  def initialize(key, value)
  @key, @value = key, value
  end
-
- # common functions
- def identity
- @controller.emit(@key => @value)
- end
  end

  class BaseReducerModel < BaseModel
@@ -95,14 +101,5 @@ module HadoopDsl
  def initialize(key, values)
  @key, @values = key, values
  end
-
- # common functions
- def aggregate
- @controller.emit(@key => @values.inject {|ret, i| ret + i})
- end
-
- def identity
- @values.each {|v| @controller.emit(@key => v)}
- end
  end
  end
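(This hunk appears to be lib/core.rb, inferred from its classes and the gemspec file list.) The refactor extracts two previously duplicated pieces into mixins: DslElement#method_missing swallows any DSL word that has no matching method while still evaluating an attached block, and DslController carries the shared run / pre_process script-evaluation loop, so controllers and models no longer each define their own copies. A standalone sketch of the method_missing idea, not the gem's own code:

    # Unknown DSL statements are accepted; their blocks still run,
    # and self is returned so calls can chain.
    module DslElement
      def method_missing(method_name, *args)
        yield if block_given?
        self
      end
    end

    class Script
      include DslElement
    end

    Script.new.data('apache log') { puts 'block still evaluated' }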
@@ -1,16 +1,7 @@
- require 'core'
- require 'java'
- require 'mapred_factory'
-
- import 'org.apache.hadoop.io.IntWritable'
- import 'org.apache.hadoop.io.Text'
+ require 'hadoop_dsl'

  include HadoopDsl

- # Hadoop IO types
- HadoopDsl::Text = Text
- HadoopDsl::IntWritable = IntWritable
-
  def map(key, value, output, reporter, script)
  mapper = MapperFactory.create(script, key, value)
  mapper.run
@@ -0,0 +1,14 @@
+ require 'util'
+ require 'mapred_factory'
+ require 'core'
+
+ # for jruby
+ if defined? JRUBY_VERSION
+ require 'java'
+ import 'org.apache.hadoop.io.IntWritable'
+ import 'org.apache.hadoop.io.Text'
+
+ # Hadoop IO types
+ HadoopDsl::Text = Text
+ HadoopDsl::IntWritable = IntWritable
+ end
@@ -1,4 +1,14 @@
+ require 'jruby-on-hadoop'
+
  module HadoopDsl
+ def self.lib_path
+ File.expand_path(File.dirname(__FILE__))
+ end
+
+ def self.dsl_init_script
+ File.join(lib_path, "dsl_init.rb")
+ end
+
  class Client < JRubyOnHadoop::Client
  def parse_args
  super
@@ -1,11 +1,6 @@
- require 'core'
- require 'enumerator'
+ require 'hadoop_dsl'

  module HadoopDsl::HiveLike
- include HadoopDsl
-
- AVAILABLE_METHODS = [:select, :create_table, :table]
-
  # common
  module HiveLikeMapRed
  def pre_process(body)
@@ -17,7 +12,7 @@ module HadoopDsl::HiveLike
  args = sprit_and_marge_args($2)
  processed << "#{method}(#{args})\n"
  else
- processed << line + "\n"
+ processed << line + "\n" if line
  end
  end
  processed
@@ -32,15 +27,15 @@ module HadoopDsl::HiveLike
  end

  # controller
- class HiveLikeSetup < BaseSetup
+ class HiveLikeSetup < HadoopDsl::BaseSetup
  def load_data(inputs, table)
  @from = inputs
  @to = inputs.gsub(/#{File.basename(inputs)}$/, 'outputs')
  end

  def output_format
- @conf.output_key_class = Text
- @conf.output_value_class = Text
+ @conf.output_key_class = HadoopDsl::Text
+ @conf.output_value_class = HadoopDsl::Text
  end

  # might not need but occur error if not exists
@@ -49,37 +44,43 @@ module HadoopDsl::HiveLike
  include HiveLikeMapRed
  end

- class HiveLikeMapper < BaseMapper
+ class HiveLikeMapper < HadoopDsl::BaseMapper
  def initialize(script, key, value)
  super(script, HiveLikeMapperModel.new(key, value))
  end

  include HiveLikeMapRed

- # model methods
- def_delegators :@model, *AVAILABLE_METHODS
+ def_delegators :@model, :create_table, :table
+
+ # emitters
+ def select(*args)
+ from_index = args.index('from')
+ if from_index
+ values = args[0...from_index].map do |column|
+ splitted = @model.value.split(/[,\s]+/)
+ splitted[@model.table.columns.index(column)]
+ end
+ emit(args[from_index + 1] => values.join(", "))
+ end
+ end
  end

- class HiveLikeReducer < BaseReducer
+ class HiveLikeReducer < HadoopDsl::BaseReducer
  def initialize(script, key, values)
  super(script, HiveLikeReducerModel.new(key, values))
  end

  include HiveLikeMapRed

- # model methods
- def_delegators :@model, *AVAILABLE_METHODS
+ # emitters
+ def select(*args) identity end
  end

  # model
- class HiveLikeMapperModel < BaseMapperModel
+ class HiveLikeMapperModel < HadoopDsl::BaseMapperModel
  attr_reader :table

- def initialize(key, value)
- super(key, value)
- end
-
- # emitters
  def create_table(name, *column_and_type)
  @table = Table.new(name)
  column_and_type.each_with_index do |column, index|
@@ -88,17 +89,6 @@ module HadoopDsl::HiveLike
  end
  end

- def select(*args)
- from_index = args.index('from')
- if from_index
- values = args[0...from_index].map do |column|
- splitted = @value.split(/[,\s]+/)
- splitted[@table.columns.index(column)]
- end
- @controller.emit(args[from_index + 1] => values.join(", "))
- end
- end
-
  class Table
  attr_reader :name, :columns

@@ -111,12 +101,6 @@ module HadoopDsl::HiveLike
  end
  end

- class HiveLikeReducerModel < BaseReducerModel
- def initialize(key, values)
- super(key, values)
- end
-
- # emitters
- def select(*args) identity end
+ class HiveLikeReducerModel < HadoopDsl::BaseReducerModel
  end
  end
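(This section appears to be lib/hive_like.rb, inferred.) The select emitter moves off the model onto the mapper controller, reaching state through @model: arguments before the literal 'from' name the columns to project, and the token after 'from' becomes the emit key. A plain-Ruby sketch of that projection, assuming a table declared with columns name, price, num and an input row 'apple, 3, 100' (the shape the hive-like example's '# apple, 3, 100' comment suggests):

    value   = 'apple, 3, 100'
    columns = ['name', 'price', 'num']          # assumed table columns
    args    = ['name', 'num', 'from', 'items']  # i.e. select(name, num from items)

    from_index = args.index('from')
    values = args[0...from_index].map do |col|
      value.split(/[,\s]+/)[columns.index(col)]
    end
    { args[from_index + 1] => values.join(', ') }  # => {"items" => "apple, 100"}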
@@ -1,55 +1,123 @@
- require 'core'
+ require 'hadoop_dsl'
  require 'enumerator'

  module HadoopDsl::LogAnalysis
- include HadoopDsl
-
  KEY_SEP = "\t"
  PREFIX = 'col'
  PASS = nil
- AVAILABLE_METHODS = [:separate, :pattern, :column_name, :column, :topic, :value, :count_uniq, :sum]
+ MODEL_METHODS = [:column, :value]

- # common
- module LogAnalysisMapRed
- # entry point
- def data(description = '', &block) yield end
+ # controller
+ class LogAnalysisMapper < HadoopDsl::BaseMapper
+ def initialize(script, key, value)
+ super(script, LogAnalysisMapperModel.new(key, value))
+ end

- def each_line(&block) yield end
- end
+ # model methods
+ def_delegators :@model, *MODEL_METHODS
+
+ def topic(desc, options = {}, &block)
+ @model.create_topic(desc, options)
+ yield if block_given?
+ current_topic
+ end

- # controller
- class LogAnalysisSetup < BaseSetup
- def initialize(script, conf)
- super(script, conf)
+ def separate(sep)
+ parts = value.split(sep)
+ @model.create_or_replace_columns_with(parts) {|column, value| column.value = value}
  end

- include LogAnalysisMapRed
- end
+ def pattern(re)
+ if value =~ re
+ md = Regexp.last_match
+ @model.create_or_replace_columns_with(md.captures) {|column, value| column.value = value}
+ end
+ end

- class LogAnalysisMapper < BaseMapper
- def initialize(script, key, value)
- super(script, LogAnalysisMapperModel.new(key, value))
+ # column names by String converted to Symbol
+ def column_name(*names)
+ sym_names = names.map {|name| name.is_a?(String) ? name.to_sym : name }
+ @model.create_or_replace_columns_with(sym_names) {|column, name| column.name = name}
  end

- include LogAnalysisMapRed
+ def group_by(column_or_value)
+ case column_or_value
+ when LogAnalysisMapperModel::Column
+ column = column_or_value
+ current_topic.key_elements << column.value
+ else
+ value = column_or_value
+ current_topic.key_elements << value
+ end
+ end

- # model methods
- def_delegators :@model, *AVAILABLE_METHODS
+ def group_date_by(column, term)
+ require 'time'
+ time = parse_time(column.value)
+ time_key = case term
+ when :daily then time.strftime('%Y%m%d')
+ when :monthly then time.strftime('%Y%m')
+ when :yearly then time.strftime('%Y')
+ end
+ current_topic.key_elements << time_key
+ end
+
+ # emitters
+ def count_uniq(column_or_value)
+ uniq_key =
+ case column_or_value
+ when LogAnalysisMapperModel::Column
+ column = column_or_value
+ column.value
+ else column_or_value # value
+ end
+ current_topic.key_elements << uniq_key
+ emit(current_topic.key => 1)
+ end
+
+ def sum(column)
+ emit(current_topic.key => column.value.to_i)
+ end
+
+ private
+ def current_topic; @model.current_topic end
+
+ def parse_time(str)
+ begin Time.parse(str)
+ rescue
+ # apachelog pattern ex) "10/Oct/2000:13:55:36 -0700"
+ Time.parse($1) if str =~ /^(\d*\/\w*\/\d*):/
+ end
+ end
  end

- class LogAnalysisReducer < BaseReducer
+ class LogAnalysisReducer < HadoopDsl::BaseReducer
  def initialize(script, key, values)
  super(script, LogAnalysisReducerModel.new(key, values))
  end

- include LogAnalysisMapRed
-
  # model methods
- def_delegators :@model, *AVAILABLE_METHODS
+ def_delegators :@model, *MODEL_METHODS
+
+ def topic(desc, options = {}, &block)
+ @model.create_topic(desc, options)
+ yield if block_given?
+ @model.current_topic
+ end
+
+ def count_uniq(column)
+ aggregate if @model.topic == @model.current_topic
+ end
+
+ def sum(column)
+ aggregate if @model.topic == @model.current_topic
+ end
  end

  # model
- class LogAnalysisMapperModel < BaseMapperModel
+ class LogAnalysisMapperModel < HadoopDsl::BaseMapperModel
+ attr_reader :current_topic
+
  def initialize(key, value)
  super(key, value)
  @columns = ColumnArray.new
@@ -58,28 +126,8 @@ module HadoopDsl::LogAnalysis

  def column; @columns end

- def topic(desc, options = {}, &block)
+ def create_topic(desc, options)
  @topics << @current_topic = Topic.new(desc, options[:label])
- yield if block_given?
- @current_topic
- end
-
- def separate(sep)
- parts = @value.split(sep)
- create_or_replace_columns_with(parts) {|column, value| column.value = value}
- end
-
- def pattern(re)
- if @value =~ re
- md = Regexp.last_match
- create_or_replace_columns_with(md.captures) {|column, value| column.value = value}
- end
- end
-
- # column names by String converted to Symbol
- def column_name(*names)
- sym_names = names.map {|name| name.is_a?(String) ? name.to_sym : name }
- create_or_replace_columns_with(sym_names) {|column, name| column.name = name}
  end

  def create_or_replace_columns_with(array, &block)
@@ -91,15 +139,6 @@ module HadoopDsl::LogAnalysis
  @columns = ColumnArray.new(columns)
  end

- # emitters
- def count_uniq(column)
- @controller.emit([@current_topic.label, KEY_SEP, column.value].join => 1)
- end
-
- def sum(column)
- @controller.emit([@current_topic.label].join => column.value.to_i)
- end
-
  class ColumnArray < Array
  def [](key)
  case key
@@ -120,17 +159,28 @@ module HadoopDsl::LogAnalysis
  end

  class Topic
+ attr_reader :key_elements
+
  def initialize(desc, label = nil)
  @desc, @label = desc, label
+ @key_elements = []
  end

  def label
  @label || @desc.gsub(/\s/, '_')
  end
+
+ def key
+ without_label =
+ @key_elements.size > 0 ? @key_elements.join(KEY_SEP) : nil
+ [label, without_label].compact.join(KEY_SEP)
+ end
  end
  end

- class LogAnalysisReducerModel < BaseReducerModel
+ class LogAnalysisReducerModel < HadoopDsl::BaseReducerModel
+ attr_reader :topic, :current_topic
+
  def initialize(key, values)
  super(key, values)
  if key =~ /(\w*)#{KEY_SEP}?(.*)/
@@ -138,18 +188,8 @@ module HadoopDsl::LogAnalysis
  end
  end

- def topic(desc, options = {}, &block)
+ def create_topic(desc, options)
  @current_topic = Topic.new(options[:label] || desc.gsub(/\s/, '_'), nil)
- yield if block_given?
- @current_topic
- end
-
- def count_uniq(column)
- aggregate if @topic == @current_topic
- end
-
- def sum(column)
- aggregate if @topic == @current_topic
  end

  class Topic
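(End of the lib/log_analysis.rb changes.) The key building that the model's count_uniq used to inline now lives on Topic#key: the label plus any accumulated key_elements, joined by KEY_SEP ("\t"); the reducer model splits the label back off with its /(\w*)#{KEY_SEP}?(.*)/ match. A short sketch mirroring the Topic specs further down:

    require 'log_analysis'
    Topic = HadoopDsl::LogAnalysis::LogAnalysisMapperModel::Topic

    t = Topic.new('ua counts', 'ua')   # desc, explicit label
    t.key_elements << '20001010'       # as pushed by group_date_by
    t.key_elements << 'SomeBot/1.0'    # as pushed by count_uniq
    t.key                              # => "ua\t20001010\tSomeBot/1.0"

    Topic.new('desc with space').key   # => "desc_with_space"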
@@ -1,16 +1,16 @@
- require 'util'
+ require 'hadoop_dsl'

  module HadoopDsl
  class MapRedFactory
  def self.dsl_name(script)
- read_file(script).each_line do |line|
- dsl_name = $1 if line =~ /^use\s*'(\w*)'/
- return dsl_name
+ HadoopDsl.read_file(script).each_line do |line|
+ dsl_name = $1 if line =~ /\s*dsl\s*\(?["'](\w*)["']\)?/
+ return dsl_name if dsl_name
  end
  end

  def self.require_dsl_lib(dsl_name)
- require snake_case(dsl_name)
+ require HadoopDsl.snake_case(dsl_name)
  end
  end
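(lib/mapred_factory.rb.) The old pattern only recognized a single-quoted use at the start of a line and returned after the first line regardless; the deleted TODO at the bottom of this diff ('"use" method not allowed double quote..') is exactly what this fixes. The new pattern tolerates double quotes, leading whitespace, and parentheses, and the loop keeps scanning until a line matches. A quick check, runnable in plain Ruby:

    RE = /\s*dsl\s*\(?["'](\w*)["']\)?/
    [%q{dsl 'LogAnalysis'},
     %q{dsl "LogAnalysis"},
     %q{  dsl "LogAnalysis"  },
     %q{  dsl ("LogAnalysis") }].each do |line|
      puts line[RE, 1]  # => LogAnalysis, all four times
    end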
@@ -1,18 +1,18 @@
  # utility functions
+ require 'hadoop_dsl'

  module HadoopDsl
- def snake_case(str)
+ def self.snake_case(str)
  str.gsub(/\B[A-Z]/, '_\&').downcase
  end

- def read_file(file_name)
+ def self.read_file(file_name)
  # read as usual
  body = File.open(file_name).read rescue nil
  return body if body

  # read from loadpath
  $:.each do |path|
- p path
  body = File.open(File.join(path, file_name)).read rescue next
  return body if body
  end
@@ -1,76 +1,56 @@
- require 'core'
+ require 'hadoop_dsl'
  require 'enumerator'

  module HadoopDsl::WordCount
- include HadoopDsl
-
- AVAILABLE_METHODS = [:count_uniq, :total]
+ MODEL_METHODS = []
  TOTAL_PREFIX = "\t"

- # common
- module WordCountMapRed
- # entry point
- def data(description = '', &block) yield end
- end
-
  # controller
- class WordCountMapper < BaseMapper
+ class WordCountMapper < HadoopDsl::BaseMapper
  def initialize(script, key, value)
  super(script, WordCountMapperModel.new(key, value))
  end

- include WordCountMapRed
-
- # model methods
- def_delegators :@model, *AVAILABLE_METHODS
- end
-
- class WordCountReducer < BaseReducer
- def initialize(script, key, values)
- super(script, WordCountReducerModel.new(key, values))
- end
-
- include WordCountMapRed
-
  # model methods
- def_delegators :@model, *AVAILABLE_METHODS
- end
-
- # model
- class WordCountMapperModel < BaseMapperModel
- def initialize(key, value)
- super(key, value)
- end
+ def_delegators :@model, *MODEL_METHODS

  # emitters
  def count_uniq
- @value.split.each {|word| @controller.emit(word => 1)}
+ @model.value.split.each {|word| emit(word => 1)}
  end

  def total(*types)
  types.each do |type|
  case type
  when :bytes
- @controller.emit("#{TOTAL_PREFIX}total bytes" => @value.gsub(/\s/, '').length)
+ emit("#{TOTAL_PREFIX}total bytes" => @model.value.gsub(/\s/, '').length)
  when :words
- @controller.emit("#{TOTAL_PREFIX}total words" => @value.split.size)
+ emit("#{TOTAL_PREFIX}total words" => @model.value.split.size)
  when :lines
- @controller.emit("#{TOTAL_PREFIX}total lines" => 1)
+ emit("#{TOTAL_PREFIX}total lines" => 1)
  end
  end
  end
  end

- class WordCountReducerModel < BaseReducerModel
- def initialize(key, values)
- super(key, values)
+ class WordCountReducer < HadoopDsl::BaseReducer
+ def initialize(script, key, values)
+ super(script, WordCountReducerModel.new(key, values))
  end

+ # model methods
+ def_delegators :@model, *MODEL_METHODS
+
  # emitters
- def count_uniq; aggregate unless total_value? end
- def total(*types); aggregate if total_value? end
+ def count_uniq; aggregate unless @model.total_value? end
+ def total(*types); aggregate if @model.total_value? end
+ end
+
+ # model
+ class WordCountMapperModel < HadoopDsl::BaseMapperModel
+ end

- private
+ class WordCountReducerModel < HadoopDsl::BaseReducerModel
  def total_value?; @key =~ /^#{TOTAL_PREFIX}/ end
  end
  end
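(lib/word_count.rb.) Same treatment as the other DSLs: the emitters move from the models onto the controllers and reach data via @model, and on the reduce side total rows are told apart from word rows by the TOTAL_PREFIX tab at the start of the key. A minimal sketch of the mapper's emissions, assuming only that the gem's lib directory is on the load path:

    require 'word_count'
    include HadoopDsl::WordCount

    mapper = WordCountMapper.new(nil, nil, 'hello hadoop hello')
    mapper.count_uniq
    mapper.total(:words, :lines)
    mapper.emitted
    # => [{"hello"=>1}, {"hadoop"=>1}, {"hello"=>1},
    #     {"\ttotal words"=>3}, {"\ttotal lines"=>1}]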
@@ -1,4 +1,5 @@
- require 'hadoop-dsl'
+ require File.join(File.dirname(__FILE__), 'spec_helper')
+ require 'hadoop_dsl_client'

  describe HadoopDsl::Client do
  before do
@@ -1,4 +1,4 @@
- require 'dsl_init'
+ require File.join(File.dirname(__FILE__), 'spec_helper')
  require 'core'

  include HadoopDsl
@@ -38,7 +38,7 @@ to 'test/outputs'
  it 'can emit as identity' do
  model = BaseMapperModel.new('key', 'value')
  mapper = BaseMapper.new(@script, model)
- model.identity
+ mapper.identity

  mapper.emitted.should == [{'key' => 'value'}]
  end
@@ -48,7 +48,7 @@ to 'test/outputs'
  it 'can emit as aggregate' do
  model = BaseReducerModel.new('key', [1, 2, 3])
  reducer = BaseReducer.new(@script, model)
- model.aggregate
+ reducer.aggregate

  reducer.emitted.should == [{'key' => 6}]
  end
@@ -56,7 +56,7 @@ to 'test/outputs'
  it 'can emit as identity' do
  model = BaseReducerModel.new('key', [1, 2, 3])
  reducer = BaseReducer.new(@script, model)
- model.identity
+ reducer.identity

  reducer.emitted.should == [{'key' => 1}, {'key' => 2}, {'key' => 3}]
  end
@@ -4,7 +4,7 @@ describe 'mapreduce init' do

  before(:all) do
  @script = create_tmp_script(<<-EOF)
- use 'LogAnalysis'
+ dsl 'LogAnalysis'
  data 'test' do
  from 'test/inputs'
  to 'test/outputs'
@@ -1,23 +1,25 @@
  require 'log_analysis'
  require 'word_count'
+ require 'hive_like'

  include HadoopDsl::LogAnalysis
  describe 'Aapach Log Example' do
  before(:all) do
- @script = File.join(File.dirname(__FILE__), '..', 'examples', 'apachelog-v2.rb')
- @value = '127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326'
+ @script = File.join(File.dirname(__FILE__), '..', 'examples', 'log_analysis_test.rb')
+ @bot_ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
+ @value = %Q!127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326 "-" "#{@bot_ua}"!
  end

  it 'can run example by mapper' do
  mapper = LogAnalysisMapper.new(@script, nil, @value)
  mapper.run
- mapper.emitted.first["user\tfrank"].should == 1
+ mapper.emitted.first.should == {"ua\t#{@bot_ua}" => 1}
  end

  it 'can run example by reducer' do
- reducer = LogAnalysisReducer.new(@script, "user\tfrank", [1, 1, 1])
+ reducer = LogAnalysisReducer.new(@script, "ua\tChrome", [1, 1, 1])
  reducer.run
- reducer.emitted.first["user\tfrank"].should == 3
+ reducer.emitted.first["ua\tChrome"].should == 3
  end
  end

@@ -1,4 +1,4 @@
- require 'core'
+ require File.join(File.dirname(__FILE__), 'spec_helper')
  require 'hive_like'

  include HadoopDsl::HiveLike
@@ -1,4 +1,4 @@
- require 'core'
+ require File.join(File.dirname(__FILE__), 'spec_helper')
  require 'log_analysis'

  include HadoopDsl::LogAnalysis
@@ -39,13 +39,22 @@ describe LogAnalysisMapper do
  mapper.column[:user].value.should == 'frank'
  end

- it 'should count uniq column' do
+ it 'should count uniq by column' do
  value = 'count uniq'
  mapper = LogAnalysisMapper.new(nil, nil, value)
  mapper.separate(' ')
  mapper.topic('t1') { mapper.count_uniq mapper.column[1] }

- mapper.emitted.first["t1\tuniq"].should == 1
+ mapper.emitted.should == [{"t1\tuniq" => 1}]
+ end
+
+ it 'should count uniq by value' do
+ value = 'count uniq'
+ mapper = LogAnalysisMapper.new(nil, nil, value)
+ mapper.separate(' ')
+ mapper.topic('t1') { mapper.count_uniq 'orig value' }
+
+ mapper.emitted.should == [{"t1\torig value" => 1}]
  end

  it 'should sum column value' do
@@ -83,6 +92,54 @@ describe LogAnalysisMapper do
  topic = mapper.topic('desc with space')
  topic.label.should == 'desc_with_space'
  end
+
+ it 'can group date monthly' do
+ value = '2010/1/1 newyearday'
+ mapper = LogAnalysisMapper.new(nil, nil, value)
+ mapper.separate(' ')
+ mapper.column_name 'date', 'holiday'
+
+ ['yearly', 'monthly', 'daily'].each do |term|
+ mapper.topic(term) do
+ mapper.group_date_by mapper.column[:date], term.to_sym
+ mapper.count_uniq mapper.column[:holiday]
+ end
+ end
+ mapper.emitted.should ==
+ [
+ {"yearly\t2010\tnewyearday" => 1},
+ {"monthly\t201001\tnewyearday" => 1},
+ {"daily\t20100101\tnewyearday" => 1}
+ ]
+ end
+
+ it 'can group by' do
+ value = '1 sub_2 bingo!'
+ mapper = LogAnalysisMapper.new(nil, nil, value)
+ mapper.separate(' ')
+ mapper.column_name 'id', 'sub_id', 'data'
+
+ mapper.topic('test') do
+ mapper.group_by mapper.column[:sub_id]
+ mapper.count_uniq mapper.column[:data]
+ end
+ mapper.emitted.should == [{"test\tsub_2\tbingo!" => 1}]
+ end
+ end
+
+ Topic = LogAnalysisMapperModel::Topic
+ describe Topic do
+ it 'can get key with label' do
+ t = Topic.new('label')
+ t.key.should == 'label'
+ end
+
+ it 'can get key with label and elements' do
+ t = Topic.new('label')
+ t.key_elements << 'e1'
+ t.key_elements << 'e2'
+ t.key.should == "label\te1\te2"
+ end
  end

  describe LogAnalysisReducer do
@@ -1,31 +1,33 @@
  require File.join(File.dirname(__FILE__) , 'spec_helper')
-
  require 'mapred_factory'
- require 'log_analysis'

- describe 'MapRed Factory' do
+ include HadoopDsl

+ describe 'MapRed Factory' do
  before(:all) do
- @script = create_tmp_script("use 'LogAnalysis'")
+ @script = create_tmp_script("dsl 'LogAnalysis'")
  end

  it 'can create mapper' do
  mapper = MapperFactory.create(@script, nil, nil)
- mapper.class.should == LogAnalysisMapper
+ mapper.class.should == LogAnalysis::LogAnalysisMapper
  end

  it 'can create reducer' do
  reducer = ReducerFactory.create(@script, nil, nil)
- reducer.class.should == LogAnalysisReducer
+ reducer.class.should == LogAnalysis::LogAnalysisReducer
  end

  it 'can create setup' do
- s = SetupFactory.create(@script, nil)
- s.class.should == LogAnalysisSetup
+ conf = mock('conf')
+ conf.should_receive(:output_key_class=).once
+ conf.should_receive(:output_value_class=).once
+ s = SetupFactory.create(create_tmp_script("dsl 'HiveLike'"), conf)
+ s.class.should == HiveLike::HiveLikeSetup
  end

  it 'can create base if not exists in specific DSL' do
- s = SetupFactory.create(create_tmp_script("use 'WordCount'"), nil)
+ s = SetupFactory.create(create_tmp_script("dsl 'WordCount'"), nil)
  s.class.should == BaseSetup
  end

@@ -37,6 +39,24 @@ describe 'MapRed Factory' do
  it 'can convert dsl name to dsl lib file and require' do
  dsl_name = MapRedFactory.dsl_name(@script)
  MapRedFactory.require_dsl_lib(dsl_name).should_not be_nil
- LogAnalysisMapper
+ LogAnalysis::LogAnalysisMapper
+ end
+
+ it 'can create mapper if statement has double quote' do
+ script = create_tmp_script(%Q!dsl "LogAnalysis"!)
+ mapper = MapperFactory.create(script, nil, nil)
+ mapper.class.should == LogAnalysis::LogAnalysisMapper
+ end
+
+ it 'can create mapper if exists more space' do
+ script = create_tmp_script(%Q! dsl "LogAnalysis" !)
+ mapper = MapperFactory.create(script, nil, nil)
+ mapper.class.should == LogAnalysis::LogAnalysisMapper
+ end
+
+ it 'can create mapper if exists bracket' do
+ script = create_tmp_script(%Q! dsl ("LogAnalysis") !)
+ mapper = MapperFactory.create(script, nil, nil)
+ mapper.class.should == LogAnalysis::LogAnalysisMapper
  end
  end
@@ -1,5 +1,4 @@
  # spec helper
-
  require 'tempfile'

  def create_tmp_script(body)
@@ -1,19 +1,18 @@
  require File.join(File.dirname(__FILE__) , 'spec_helper')
-
  require 'util'

  describe 'utilities' do
  it 'can change camelcase str to snakecase' do
- snake_case('CamelCaseStr').should == 'camel_case_str'
+ HadoopDsl.snake_case('CamelCaseStr').should == 'camel_case_str'
  end

  it 'can read file and get file data to string' do
  script_body = 'This is a script body.'
  @script = create_tmp_script(script_body)
- read_file(@script).should == script_body
+ HadoopDsl.read_file(@script).should == script_body
  end

  it 'raise error if no file in loadpath' do
- lambda { read_file('not_exists_on_loadpath') }.should raise_error
+ lambda { HadoopDsl.read_file('not_exists_on_loadpath') }.should raise_error
  end
  end
@@ -1,4 +1,4 @@
- require 'core'
+ require File.join(File.dirname(__FILE__), 'spec_helper')
  require 'word_count'

  include HadoopDsl::WordCount
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: hadoop-rubydsl
  version: !ruby/object:Gem::Version
- version: 0.0.3
+ version: 0.0.4
  platform: ruby
  authors:
  - Koichi Fujikawa
@@ -9,7 +9,7 @@ autorequire:
  bindir: bin
  cert_chain: []

- date: 2010-01-04 00:00:00 +09:00
+ date: 2010-01-13 00:00:00 +09:00
  default_executable:
  dependencies:
  - !ruby/object:Gem::Dependency
@@ -32,27 +32,23 @@ extensions: []

  extra_rdoc_files:
  - README.rdoc
- - TODO
  files:
  - .gitignore
  - README.rdoc
  - Rakefile
- - TODO
  - VERSION
  - bin/hadoop-hudson.sh
  - bin/hadoop-ruby.sh
  - bin/hrd
  - conf/hadoop-site.xml
- - examples/apachelog-v2-2.rb
- - examples/apachelog-v2.rb
- - examples/apachelog.rb
  - examples/hive_like_test.rb
+ - examples/log_analysis_test.rb
  - examples/word_count_test.rb
  - hadoop-rubydsl.gemspec
- - lib/client.rb
  - lib/core.rb
  - lib/dsl_init.rb
- - lib/hadoop-dsl.rb
+ - lib/hadoop_dsl.rb
+ - lib/hadoop_dsl_client.rb
  - lib/hive_like.rb
  - lib/log_analysis.rb
  - lib/mapred_factory.rb
@@ -97,8 +93,6 @@ test_files:
  - spec/hive_like_spec.rb
  - spec/log_analysis_spec.rb
  - spec/example_spec.rb
- - examples/apachelog-v2.rb
  - examples/hive_like_test.rb
+ - examples/log_analysis_test.rb
  - examples/word_count_test.rb
- - examples/apachelog-v2-2.rb
- - examples/apachelog.rb
data/TODO DELETED
@@ -1,2 +0,0 @@
- * entire error handling
- * "use" method not allowed double quote..
@@ -1,25 +0,0 @@
- use 'LogAnalysis'
-
- data 'apache log on test1' do
- from 'apachlog/inputs'
- to 'apachlog/outputs'
-
- each_line do
- pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*)/
- column_name 'remote_host', 'pass', 'user', 'access_date', 'request', 'status', 'bytes' # label each column
-
- topic 'which users?', :label => 'user' do
- count_uniq column[:user]
- end
-
- # topic 'access date by monthly' do
- # select_date column[:access_date], BY_MONTHLY
- # count column[:access_date]
- # end
- #
- # topic 'total bytes' do
- # select_date column[:access_date], BY_MONTHLY
- # sum column[:bytes].to_kilobytes # / 1024
- # end
- end
- end
@@ -1,15 +0,0 @@
- # Apache log analysis
- #
- # example target data:
- # 127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326
- # 127.0.0.1 - frank2 [10/Oct/2000:13:55:36 -0700] "GET /apache_pb2.gif HTTP/1.0" 200 2326
- # 127.0.0.1 - frank2 [10/Oct/2000:13:55:36 -0700] "GET /apache_pb3.gif HTTP/1.0" 404 2326
-
- use 'LogAnalysis'
-
- data.pattern /(.*) (.*) (.*) (\[.*\]) (".*") (\d*) (\d*)/
- column[2].count_uniq
- column[3].count_uniq
- column[4].count_uniq
- column[5].count_uniq
- column[6].sum
@@ -1,12 +0,0 @@
- require 'jruby-on-hadoop'
- require 'client'
-
- module HadoopDsl
- def self.lib_path
- File.expand_path(File.dirname(__FILE__))
- end
-
- def self.dsl_init_script
- File.join(lib_path, "dsl_init.rb")
- end
- end