hadoop-rubydsl 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
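In short: 0.0.4 renames the script entry keyword from `use` to `dsl` (and, per the reworked `MapRedFactory.dsl_name` regex below, now tolerates double quotes, parentheses, and leading whitespace), replaces `lib/hadoop-dsl.rb` and `lib/client.rb` with `lib/hadoop_dsl.rb` and `lib/hadoop_dsl_client.rb`, moves DSL statement handling from the model classes into the controller classes, and adds `group_by`/`group_date_by` to the LogAnalysis DSL. A minimal 0.0.4-style script, assembled from the example files in this diff (paths and column names are illustrative):

    dsl 'LogAnalysis'

    data 'apache log' do
      from 'apachelog/inputs'
      to 'apachelog/outputs'

      each_line do
        pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*)/
        column_name 'remote_host', 'pass', 'user', 'access_date', 'request', 'status', 'bytes'

        topic 'user counts by daily' do
          group_date_by column[:access_date], :daily
          count_uniq column[:user]
        end
      end
    end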
data/VERSION CHANGED
@@ -1 +1 @@
- 0.0.3
+ 0.0.4
data/bin/hrd CHANGED
@@ -1,5 +1,5 @@
  #!/usr/bin/env ruby
 
- require 'hadoop-dsl'
+ require 'hadoop_dsl_client'
 
  HadoopDsl::Client.new(ARGV).run
@@ -1,4 +1,4 @@
- use 'HiveLike'
+ dsl 'HiveLike'
 
  # hive-like/items.txt
  # apple, 3, 100
@@ -1,4 +1,4 @@
- use 'LogAnalysis'
+ dsl 'LogAnalysis'
 
  data 'apache log on test2' do
    from 'apachelog/inputs'
@@ -14,5 +14,28 @@ data 'apache log on test2' do
     topic 'ua counts', :label => 'ua' do
       count_uniq column[:ua]
     end
+
+    topic 'count bot', :label => 'bot' do
+      ua = column[:ua].value
+      bot = ua if ua =~ /bot/i
+      count_uniq bot
+    end
+
+    topic 'ua counts group by path' do
+      request = column[:request].value
+      path = request.split(/\s+/)[1]
+      group_by path
+      count_uniq column[:ua]
+    end
+
+    topic 'ua counts by daily' do
+      group_date_by column[:access_date], :daily
+      count_uniq column[:ua]
+    end
+
+    # topic 'total bytes' do
+    #   select_date column[:access_date], BY_MONTHLY
+    #   sum column[:bytes].to_kilobytes # / 1024
+    # end
   end
 end
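With the new grouping statements, the emitted map key is composed from the topic label plus each grouped element, tab-joined (see `Topic#key` in the LogAnalysis library below). A hedged sketch of what the added topics emit for one combined-format log line, assuming a request for /apache_pb.gif on 10/Oct/2000:

    # topic 'ua counts group by path' (no :label, so the label is derived
    # from the description):
    #   {"ua_counts_group_by_path\t/apache_pb.gif\t<ua>" => 1}
    # topic 'ua counts by daily':
    #   {"ua_counts_by_daily\t20001010\t<ua>" => 1}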
@@ -1,4 +1,4 @@
- use 'WordCount'
+ dsl 'WordCount'
 
  from 'wc/inputs'
  to 'wc/outputs'
@@ -5,38 +5,34 @@
 
  Gem::Specification.new do |s|
    s.name = %q{hadoop-rubydsl}
-   s.version = "0.0.3"
+   s.version = "0.0.4"
 
    s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
    s.authors = ["Koichi Fujikawa"]
-   s.date = %q{2010-01-04}
+   s.date = %q{2010-01-13}
    s.description = %q{Hadoop Ruby DSL}
    s.email = %q{fujibee@gmail.com}
    s.executables = ["hrd", "hadoop-hudson.sh", "hadoop-ruby.sh"]
    s.extra_rdoc_files = [
-     "README.rdoc",
-     "TODO"
+     "README.rdoc"
    ]
    s.files = [
      ".gitignore",
      "README.rdoc",
      "Rakefile",
-     "TODO",
      "VERSION",
      "bin/hadoop-hudson.sh",
      "bin/hadoop-ruby.sh",
      "bin/hrd",
      "conf/hadoop-site.xml",
-     "examples/apachelog-v2-2.rb",
-     "examples/apachelog-v2.rb",
-     "examples/apachelog.rb",
      "examples/hive_like_test.rb",
+     "examples/log_analysis_test.rb",
      "examples/word_count_test.rb",
      "hadoop-rubydsl.gemspec",
-     "lib/client.rb",
      "lib/core.rb",
      "lib/dsl_init.rb",
-     "lib/hadoop-dsl.rb",
+     "lib/hadoop_dsl.rb",
+     "lib/hadoop_dsl_client.rb",
      "lib/hive_like.rb",
      "lib/log_analysis.rb",
      "lib/mapred_factory.rb",
@@ -59,11 +55,9 @@ Gem::Specification.new do |s|
      "spec/hive_like_spec.rb",
      "spec/log_analysis_spec.rb",
      "spec/example_spec.rb",
-     "examples/apachelog-v2.rb",
      "examples/hive_like_test.rb",
-     "examples/word_count_test.rb",
-     "examples/apachelog-v2-2.rb",
-     "examples/apachelog.rb"
+     "examples/log_analysis_test.rb",
+     "examples/word_count_test.rb"
    ]
 
    if s.respond_to? :specification_version then
@@ -1,10 +1,33 @@
- require 'util'
+ require 'hadoop_dsl'
  require 'forwardable'
 
  module HadoopDsl
+   # common
+   module DslElement
+     # all DSL statements without def is processed here
+     def method_missing(method_name, *args)
+       yield if block_given?
+       self
+     end
+   end
+
    # controller
+   module DslController
+     include DslElement
+
+     def run
+       body = pre_process(HadoopDsl.read_file(@script))
+       eval(body, binding, @script)
+     end
+
+     def pre_process(body)
+       body # do nothing
+     end
+   end
+
    class BaseMapRed
      extend Forwardable
+     include DslController
 
      attr_reader :emitted
 
@@ -14,66 +37,54 @@ module HadoopDsl
        @emitted = []
      end
 
-     def run
-       body = pre_process(read_file(@script))
-       eval(body, binding, @script)
-     end
-
-     def pre_process(body)
-       body # do nothing
-     end
-
      def emit(hash) @emitted << hash end
 
-     # all DSL statements without def is processed here
-     def method_missing(method_name, *args) self end
+     private
+     def key; @model.key end
    end
 
    class BaseSetup
+     include DslController
+
      def initialize(script, conf)
        @script, @conf = script, conf
        output_format
      end
 
-     def run
-       body = pre_process(read_file(@script))
-       eval(body, binding, @script)
-     end
-
-     def pre_process(body)
-       body # do nothing
-     end
-
-     # do nothing
-     def output_format; end
-
+     def output_format; end # do nothing
      def paths; [@from, @to] end
-
      def from(path) @from = path end
      def to(path) @to = path end
-
-     # all DSL statements without def is processed here
-     def method_missing(method_name, *args) self end
    end
 
    class BaseMapper < BaseMapRed
-     def initialize(script, model)
-       super(script, model)
+     # common functions
+     def identity
+       emit(@model.key => @model.value)
      end
+
+     private
+     def value; @model.values end
    end
 
    class BaseReducer < BaseMapRed
-     def initialize(script, model)
-       super(script, model)
+     # common functions
+     def aggregate
+       emit(@model.key => @model.values.inject {|ret, i| ret + i})
+     end
+
+     def identity
+       @model.values.each {|v| emit(@model.key => v)}
      end
+
+     private
+     def values; @model.values end
    end
 
    # model
    class BaseModel
+     include DslElement
      attr_accessor :controller
-
-     # all DSL statements without def is processed here
-     def method_missing(method_name, *args) self end
    end
 
    class BaseMapperModel < BaseModel
@@ -82,11 +93,6 @@ module HadoopDsl
      def initialize(key, value)
        @key, @value = key, value
      end
-
-     # common functions
-     def identity
-       @controller.emit(@key => @value)
-     end
    end
 
    class BaseReducerModel < BaseModel
@@ -95,14 +101,5 @@ module HadoopDsl
      def initialize(key, values)
        @key, @values = key, values
      end
-
-     # common functions
-     def aggregate
-       @controller.emit(@key => @values.inject {|ret, i| ret + i})
-     end
-
-     def identity
-       @values.each {|v| @controller.emit(@key => v)}
-     end
    end
  end
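The net effect of the core refactor: `run`/`pre_process` and the `method_missing` catch-all move out of the individual base classes into the shared `DslController`/`DslElement` mix-ins, and `method_missing` now yields its block and returns `self`, so unknown DSL statements (such as `data` or `each_line` in a script) still execute their nested blocks instead of being dropped. A toy illustration of that dispatch pattern, not code from the gem:

    class Swallow
      # any unknown statement runs its block, then chains
      def method_missing(method_name, *args)
        yield if block_given?
        self
      end
    end

    Swallow.new.data('desc').each_line { puts 'nested block still runs' }
    # => prints "nested block still runs"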
@@ -1,16 +1,7 @@
- require 'core'
- require 'java'
- require 'mapred_factory'
-
- import 'org.apache.hadoop.io.IntWritable'
- import 'org.apache.hadoop.io.Text'
+ require 'hadoop_dsl'
 
  include HadoopDsl
 
- # Hadoop IO types
- HadoopDsl::Text = Text
- HadoopDsl::IntWritable = IntWritable
-
  def map(key, value, output, reporter, script)
    mapper = MapperFactory.create(script, key, value)
    mapper.run
@@ -0,0 +1,14 @@
+ require 'util'
+ require 'mapred_factory'
+ require 'core'
+
+ # for jruby
+ if defined? JRUBY_VERSION
+   require 'java'
+   import 'org.apache.hadoop.io.IntWritable'
+   import 'org.apache.hadoop.io.Text'
+
+   # Hadoop IO types
+   HadoopDsl::Text = Text
+   HadoopDsl::IntWritable = IntWritable
+ end
@@ -1,4 +1,14 @@
+ require 'jruby-on-hadoop'
+
  module HadoopDsl
+   def self.lib_path
+     File.expand_path(File.dirname(__FILE__))
+   end
+
+   def self.dsl_init_script
+     File.join(lib_path, "dsl_init.rb")
+   end
+
    class Client < JRubyOnHadoop::Client
      def parse_args
        super
@@ -1,11 +1,6 @@
- require 'core'
- require 'enumerator'
+ require 'hadoop_dsl'
 
  module HadoopDsl::HiveLike
-   include HadoopDsl
-
-   AVAILABLE_METHODS = [:select, :create_table, :table]
-
    # common
    module HiveLikeMapRed
      def pre_process(body)
@@ -17,7 +12,7 @@ module HadoopDsl::HiveLike
          args = sprit_and_marge_args($2)
          processed << "#{method}(#{args})\n"
        else
-         processed << line + "\n"
+         processed << line + "\n" if line
        end
      end
      processed
@@ -32,15 +27,15 @@ module HadoopDsl::HiveLike
    end
 
    # controller
-   class HiveLikeSetup < BaseSetup
+   class HiveLikeSetup < HadoopDsl::BaseSetup
      def load_data(inputs, table)
        @from = inputs
        @to = inputs.gsub(/#{File.basename(inputs)}$/, 'outputs')
      end
 
      def output_format
-       @conf.output_key_class = Text
-       @conf.output_value_class = Text
+       @conf.output_key_class = HadoopDsl::Text
+       @conf.output_value_class = HadoopDsl::Text
      end
 
      # might not need but occur error if not exists
@@ -49,37 +44,43 @@ module HadoopDsl::HiveLike
      include HiveLikeMapRed
    end
 
-   class HiveLikeMapper < BaseMapper
+   class HiveLikeMapper < HadoopDsl::BaseMapper
      def initialize(script, key, value)
        super(script, HiveLikeMapperModel.new(key, value))
      end
 
      include HiveLikeMapRed
 
-     # model methods
-     def_delegators :@model, *AVAILABLE_METHODS
+     def_delegators :@model, :create_table, :table
+
+     # emitters
+     def select(*args)
+       from_index = args.index('from')
+       if from_index
+         values = args[0...from_index].map do |column|
+           splitted = @model.value.split(/[,\s]+/)
+           splitted[@model.table.columns.index(column)]
+         end
+         emit(args[from_index + 1] => values.join(", "))
+       end
+     end
    end
 
-   class HiveLikeReducer < BaseReducer
+   class HiveLikeReducer < HadoopDsl::BaseReducer
      def initialize(script, key, values)
        super(script, HiveLikeReducerModel.new(key, values))
      end
 
      include HiveLikeMapRed
 
-     # model methods
-     def_delegators :@model, *AVAILABLE_METHODS
+     # emitters
+     def select(*args) identity end
    end
 
    # model
-   class HiveLikeMapperModel < BaseMapperModel
+   class HiveLikeMapperModel < HadoopDsl::BaseMapperModel
      attr_reader :table
 
-     def initialize(key, value)
-       super(key, value)
-     end
-
-     # emitters
      def create_table(name, *column_and_type)
        @table = Table.new(name)
        column_and_type.each_with_index do |column, index|
@@ -88,17 +89,6 @@ module HadoopDsl::HiveLike
        end
      end
 
-     def select(*args)
-       from_index = args.index('from')
-       if from_index
-         values = args[0...from_index].map do |column|
-           splitted = @value.split(/[,\s]+/)
-           splitted[@table.columns.index(column)]
-         end
-         @controller.emit(args[from_index + 1] => values.join(", "))
-       end
-     end
-
      class Table
        attr_reader :name, :columns
 
@@ -111,12 +101,6 @@ module HadoopDsl::HiveLike
        end
      end
 
-   class HiveLikeReducerModel < BaseReducerModel
-     def initialize(key, values)
-       super(key, values)
-     end
-
-     # emitters
-     def select(*args) identity end
+   class HiveLikeReducerModel < HadoopDsl::BaseReducerModel
    end
  end
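The HiveLike `select` emitter moved from the mapper model onto the mapper itself, but the logic is unchanged: find the `'from'` marker in the argument list, look up each preceding column name's position in the table schema, slice the current row, and emit the joined values keyed by the word after `from`. A hedged sketch of the effect (the table and row are hypothetical, and assume `create_table` registers the column names in order):

    # create_table 'items', 'name STRING', 'price INT', 'num INT'
    # value = "apple, 3, 100"
    #
    # select('name', 'price', 'from', 'items')
    #   => emits {'items' => 'apple, 3'}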
@@ -1,55 +1,123 @@
- require 'core'
+ require 'hadoop_dsl'
  require 'enumerator'
 
  module HadoopDsl::LogAnalysis
-   include HadoopDsl
-
    KEY_SEP = "\t"
    PREFIX = 'col'
    PASS = nil
-   AVAILABLE_METHODS = [:separate, :pattern, :column_name, :column, :topic, :value, :count_uniq, :sum]
+   MODEL_METHODS = [:column, :value]
 
-   # common
-   module LogAnalysisMapRed
-     # entry point
-     def data(description = '', &block) yield end
+   # controller
+   class LogAnalysisMapper < HadoopDsl::BaseMapper
+     def initialize(script, key, value)
+       super(script, LogAnalysisMapperModel.new(key, value))
+     end
 
-     def each_line(&block) yield end
-   end
+     # model methods
+     def_delegators :@model, *MODEL_METHODS
+
+     def topic(desc, options = {}, &block)
+       @model.create_topic(desc, options)
+       yield if block_given?
+       current_topic
+     end
 
-   # controller
-   class LogAnalysisSetup < BaseSetup
-     def initialize(script, conf)
-       super(script, conf)
+     def separate(sep)
+       parts = value.split(sep)
+       @model.create_or_replace_columns_with(parts) {|column, value| column.value = value}
      end
 
-     include LogAnalysisMapRed
-   end
+     def pattern(re)
+       if value =~ re
+         md = Regexp.last_match
+         @model.create_or_replace_columns_with(md.captures) {|column, value| column.value = value}
+       end
+     end
 
-   class LogAnalysisMapper < BaseMapper
-     def initialize(script, key, value)
-       super(script, LogAnalysisMapperModel.new(key, value))
+     # column names by String converted to Symbol
+     def column_name(*names)
+       sym_names = names.map {|name| name.is_a?(String) ? name.to_sym : name }
+       @model.create_or_replace_columns_with(sym_names) {|column, name| column.name = name}
      end
 
-     include LogAnalysisMapRed
+     def group_by(column_or_value)
+       case column_or_value
+       when LogAnalysisMapperModel::Column
+         column = column_or_value
+         current_topic.key_elements << column.value
+       else
+         value = column_or_value
+         current_topic.key_elements << value
+       end
+     end
 
-     # model methods
-     def_delegators :@model, *AVAILABLE_METHODS
+     def group_date_by(column, term)
+       require 'time'
+       time = parse_time(column.value)
+       time_key = case term
+         when :daily then time.strftime('%Y%m%d')
+         when :monthly then time.strftime('%Y%m')
+         when :yearly then time.strftime('%Y')
+       end
+       current_topic.key_elements << time_key
+     end
+
+     # emitters
+     def count_uniq(column_or_value)
+       uniq_key =
+         case column_or_value
+         when LogAnalysisMapperModel::Column
+           column = column_or_value
+           column.value
+         else column_or_value # value
+         end
+       current_topic.key_elements << uniq_key
+       emit(current_topic.key => 1)
+     end
+
+     def sum(column)
+       emit(current_topic.key => column.value.to_i)
+     end
+
+     private
+     def current_topic; @model.current_topic end
+
+     def parse_time(str)
+       begin Time.parse(str)
+       rescue
+         # apachelog pattern ex) "10/Oct/2000:13:55:36 -0700"
+         Time.parse($1) if str =~ /^(\d*\/\w*\/\d*):/
+       end
+     end
    end
 
-   class LogAnalysisReducer < BaseReducer
+   class LogAnalysisReducer < HadoopDsl::BaseReducer
      def initialize(script, key, values)
        super(script, LogAnalysisReducerModel.new(key, values))
      end
 
-     include LogAnalysisMapRed
-
      # model methods
-     def_delegators :@model, *AVAILABLE_METHODS
+     def_delegators :@model, *MODEL_METHODS
+
+     def topic(desc, options = {}, &block)
+       @model.create_topic(desc, options)
+       yield if block_given?
+       @model.current_topic
+     end
+
+     def count_uniq(column)
+       aggregate if @model.topic == @model.current_topic
+     end
+
+     def sum(column)
+       aggregate if @model.topic == @model.current_topic
+     end
    end
 
    # model
-   class LogAnalysisMapperModel < BaseMapperModel
+   class LogAnalysisMapperModel < HadoopDsl::BaseMapperModel
+     attr_reader :current_topic
+
      def initialize(key, value)
        super(key, value)
        @columns = ColumnArray.new
@@ -58,28 +126,8 @@ module HadoopDsl::LogAnalysis
 
      def column; @columns end
 
-     def topic(desc, options = {}, &block)
+     def create_topic(desc, options)
        @topics << @current_topic = Topic.new(desc, options[:label])
-       yield if block_given?
-       @current_topic
-     end
-
-     def separate(sep)
-       parts = @value.split(sep)
-       create_or_replace_columns_with(parts) {|column, value| column.value = value}
-     end
-
-     def pattern(re)
-       if @value =~ re
-         md = Regexp.last_match
-         create_or_replace_columns_with(md.captures) {|column, value| column.value = value}
-       end
-     end
-
-     # column names by String converted to Symbol
-     def column_name(*names)
-       sym_names = names.map {|name| name.is_a?(String) ? name.to_sym : name }
-       create_or_replace_columns_with(sym_names) {|column, name| column.name = name}
      end
 
      def create_or_replace_columns_with(array, &block)
@@ -91,15 +139,6 @@ module HadoopDsl::LogAnalysis
        @columns = ColumnArray.new(columns)
      end
 
-     # emitters
-     def count_uniq(column)
-       @controller.emit([@current_topic.label, KEY_SEP, column.value].join => 1)
-     end
-
-     def sum(column)
-       @controller.emit([@current_topic.label].join => column.value.to_i)
-     end
-
      class ColumnArray < Array
        def [](key)
          case key
@@ -120,17 +159,28 @@ module HadoopDsl::LogAnalysis
        end
      end
 
      class Topic
+       attr_reader :key_elements
+
        def initialize(desc, label = nil)
          @desc, @label = desc, label
+         @key_elements = []
        end
 
        def label
          @label || @desc.gsub(/\s/, '_')
        end
+
+       def key
+         without_label =
+           @key_elements.size > 0 ? @key_elements.join(KEY_SEP) : nil
+         [label, without_label].compact.join(KEY_SEP)
+       end
      end
    end
 
-   class LogAnalysisReducerModel < BaseReducerModel
+   class LogAnalysisReducerModel < HadoopDsl::BaseReducerModel
+     attr_reader :topic, :current_topic
+
      def initialize(key, values)
        super(key, values)
        if key =~ /(\w*)#{KEY_SEP}?(.*)/
@@ -138,18 +188,8 @@ module HadoopDsl::LogAnalysis
        end
      end
 
-     def topic(desc, options = {}, &block)
+     def create_topic(desc, options)
        @current_topic = Topic.new(options[:label] || desc.gsub(/\s/, '_'), nil)
-       yield if block_given?
-       @current_topic
-     end
-
-     def count_uniq(column)
-       aggregate if @topic == @current_topic
-     end
-
-     def sum(column)
-       aggregate if @topic == @current_topic
      end
 
      class Topic
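All of the mapper-side grouping funnels into the new `Topic#key`: `group_by`, `group_date_by`, and `count_uniq` push elements onto `key_elements`, and the key is the label plus those elements joined with the tab `KEY_SEP`. Distilled from the new Topic specs later in this diff:

    t = Topic.new('label')
    t.key                  # => "label"
    t.key_elements << 'e1'
    t.key_elements << 'e2'
    t.key                  # => "label\te1\te2"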
@@ -1,16 +1,16 @@
- require 'util'
+ require 'hadoop_dsl'
 
  module HadoopDsl
    class MapRedFactory
      def self.dsl_name(script)
-       read_file(script).each_line do |line|
-         dsl_name = $1 if line =~ /^use\s*'(\w*)'/
-         return dsl_name
+       HadoopDsl.read_file(script).each_line do |line|
+         dsl_name = $1 if line =~ /\s*dsl\s*\(?["'](\w*)["']\)?/
+         return dsl_name if dsl_name
        end
      end
 
      def self.require_dsl_lib(dsl_name)
-       require snake_case(dsl_name)
+       require HadoopDsl.snake_case(dsl_name)
      end
    end
 
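The loosened `dsl_name` regex addresses the old TODO item ('"use" method not allowed double quote..'): it now accepts single or double quotes, optional parentheses, and leading whitespace, and keeps scanning lines until a name is actually captured. The pattern can be checked on its own:

    re = /\s*dsl\s*\(?["'](\w*)["']\)?/
    [%q{dsl 'LogAnalysis'}, %q{dsl "LogAnalysis"}, %q{  dsl ("LogAnalysis") }].each do |line|
      puts line[re, 1]   # => LogAnalysis, three times
    end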
@@ -1,18 +1,18 @@
  # utility functions
+ require 'hadoop_dsl'
 
  module HadoopDsl
-   def snake_case(str)
+   def self.snake_case(str)
      str.gsub(/\B[A-Z]/, '_\&').downcase
    end
 
-   def read_file(file_name)
+   def self.read_file(file_name)
      # read as usual
      body = File.open(file_name).read rescue nil
      return body if body
 
      # read from loadpath
      $:.each do |path|
-       p path
        body = File.open(File.join(path, file_name)).read rescue next
        return body if body
      end
@@ -1,76 +1,56 @@
- require 'core'
+ require 'hadoop_dsl'
  require 'enumerator'
 
  module HadoopDsl::WordCount
-   include HadoopDsl
-
-   AVAILABLE_METHODS = [:count_uniq, :total]
+   MODEL_METHODS = []
    TOTAL_PREFIX = "\t"
 
-   # common
-   module WordCountMapRed
-     # entry point
-     def data(description = '', &block) yield end
-   end
-
    # controller
-   class WordCountMapper < BaseMapper
+   class WordCountMapper < HadoopDsl::BaseMapper
      def initialize(script, key, value)
        super(script, WordCountMapperModel.new(key, value))
      end
 
-     include WordCountMapRed
-
-     # model methods
-     def_delegators :@model, *AVAILABLE_METHODS
-   end
-
-   class WordCountReducer < BaseReducer
-     def initialize(script, key, values)
-       super(script, WordCountReducerModel.new(key, values))
-     end
-
-     include WordCountMapRed
-
      # model methods
-     def_delegators :@model, *AVAILABLE_METHODS
-   end
-
-   # model
-   class WordCountMapperModel < BaseMapperModel
-     def initialize(key, value)
-       super(key, value)
-     end
+     def_delegators :@model, *MODEL_METHODS
 
      # emitters
      def count_uniq
-       @value.split.each {|word| @controller.emit(word => 1)}
+       @model.value.split.each {|word| emit(word => 1)}
      end
 
      def total(*types)
        types.each do |type|
          case type
          when :bytes
-           @controller.emit("#{TOTAL_PREFIX}total bytes" => @value.gsub(/\s/, '').length)
+           emit("#{TOTAL_PREFIX}total bytes" => @model.value.gsub(/\s/, '').length)
          when :words
-           @controller.emit("#{TOTAL_PREFIX}total words" => @value.split.size)
+           emit("#{TOTAL_PREFIX}total words" => @model.value.split.size)
          when :lines
-           @controller.emit("#{TOTAL_PREFIX}total lines" => 1)
+           emit("#{TOTAL_PREFIX}total lines" => 1)
          end
        end
      end
    end
 
-   class WordCountReducerModel < BaseReducerModel
-     def initialize(key, values)
-       super(key, values)
+   class WordCountReducer < HadoopDsl::BaseReducer
+     def initialize(script, key, values)
+       super(script, WordCountReducerModel.new(key, values))
      end
 
+     # model methods
+     def_delegators :@model, *MODEL_METHODS
+
      # emitters
-     def count_uniq; aggregate unless total_value? end
-     def total(*types); aggregate if total_value? end
+     def count_uniq; aggregate unless @model.total_value? end
+     def total(*types); aggregate if @model.total_value? end
+   end
+
+   # model
+   class WordCountMapperModel < HadoopDsl::BaseMapperModel
+   end
 
-     private
+   class WordCountReducerModel < HadoopDsl::BaseReducerModel
      def total_value?; @key =~ /^#{TOTAL_PREFIX}/ end
    end
  end
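As with the other DSLs, the word-count emitters now live on the controllers and read the input through `@model.value`, emitting via the shared `BaseMapRed#emit`. A sketch of the mapper half, assuming `script` points at the word-count example above:

    mapper = HadoopDsl::WordCount::WordCountMapper.new(script, nil, 'hello hadoop hello')
    # count_uniq    => emits {'hello' => 1}, {'hadoop' => 1}, {'hello' => 1}
    # total(:words) => emits {"\ttotal words" => 3}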
@@ -1,4 +1,5 @@
- require 'hadoop-dsl'
+ require File.join(File.dirname(__FILE__), 'spec_helper')
+ require 'hadoop_dsl_client'
 
  describe HadoopDsl::Client do
    before do
@@ -1,4 +1,4 @@
- require 'dsl_init'
+ require File.join(File.dirname(__FILE__), 'spec_helper')
  require 'core'
 
  include HadoopDsl
@@ -38,7 +38,7 @@ to 'test/outputs'
    it 'can emit as identity' do
      model = BaseMapperModel.new('key', 'value')
      mapper = BaseMapper.new(@script, model)
-     model.identity
+     mapper.identity
 
      mapper.emitted.should == [{'key' => 'value'}]
    end
@@ -48,7 +48,7 @@ to 'test/outputs'
    it 'can emit as aggregate' do
      model = BaseReducerModel.new('key', [1, 2, 3])
      reducer = BaseReducer.new(@script, model)
-     model.aggregate
+     reducer.aggregate
 
      reducer.emitted.should == [{'key' => 6}]
    end
@@ -56,7 +56,7 @@ to 'test/outputs'
    it 'can emit as identity' do
      model = BaseReducerModel.new('key', [1, 2, 3])
      reducer = BaseReducer.new(@script, model)
-     model.identity
+     reducer.identity
 
      reducer.emitted.should == [{'key' => 1}, {'key' => 2}, {'key' => 3}]
    end
@@ -4,7 +4,7 @@ describe 'mapreduce init' do
 
    before(:all) do
      @script = create_tmp_script(<<-EOF)
-       use 'LogAnalysis'
+       dsl 'LogAnalysis'
        data 'test' do
          from 'test/inputs'
          to 'test/outputs'
@@ -1,23 +1,25 @@
  require 'log_analysis'
  require 'word_count'
+ require 'hive_like'
 
  include HadoopDsl::LogAnalysis
  describe 'Aapach Log Example' do
    before(:all) do
-     @script = File.join(File.dirname(__FILE__), '..', 'examples', 'apachelog-v2.rb')
-     @value = '127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326'
+     @script = File.join(File.dirname(__FILE__), '..', 'examples', 'log_analysis_test.rb')
+     @bot_ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
+     @value = %Q!127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326 "-" "#{@bot_ua}"!
    end
 
    it 'can run example by mapper' do
      mapper = LogAnalysisMapper.new(@script, nil, @value)
      mapper.run
-     mapper.emitted.first["user\tfrank"].should == 1
+     mapper.emitted.first.should == {"ua\t#{@bot_ua}" => 1}
    end
 
    it 'can run example by reducer' do
-     reducer = LogAnalysisReducer.new(@script, "user\tfrank", [1, 1, 1])
+     reducer = LogAnalysisReducer.new(@script, "ua\tChrome", [1, 1, 1])
      reducer.run
-     reducer.emitted.first["user\tfrank"].should == 3
+     reducer.emitted.first["ua\tChrome"].should == 3
    end
  end
 
@@ -1,4 +1,4 @@
- require 'core'
+ require File.join(File.dirname(__FILE__), 'spec_helper')
  require 'hive_like'
 
  include HadoopDsl::HiveLike
@@ -1,4 +1,4 @@
- require 'core'
+ require File.join(File.dirname(__FILE__), 'spec_helper')
  require 'log_analysis'
 
  include HadoopDsl::LogAnalysis
@@ -39,13 +39,22 @@ describe LogAnalysisMapper do
      mapper.column[:user].value.should == 'frank'
    end
 
-   it 'should count uniq column' do
+   it 'should count uniq by column' do
      value = 'count uniq'
      mapper = LogAnalysisMapper.new(nil, nil, value)
      mapper.separate(' ')
      mapper.topic('t1') { mapper.count_uniq mapper.column[1] }
 
-     mapper.emitted.first["t1\tuniq"].should == 1
+     mapper.emitted.should == [{"t1\tuniq" => 1}]
+   end
+
+   it 'should count uniq by value' do
+     value = 'count uniq'
+     mapper = LogAnalysisMapper.new(nil, nil, value)
+     mapper.separate(' ')
+     mapper.topic('t1') { mapper.count_uniq 'orig value' }
+
+     mapper.emitted.should == [{"t1\torig value" => 1}]
    end
 
    it 'should sum column value' do
@@ -83,6 +92,54 @@ describe LogAnalysisMapper do
      topic = mapper.topic('desc with space')
      topic.label.should == 'desc_with_space'
    end
+
+   it 'can group date monthly' do
+     value = '2010/1/1 newyearday'
+     mapper = LogAnalysisMapper.new(nil, nil, value)
+     mapper.separate(' ')
+     mapper.column_name 'date', 'holiday'
+
+     ['yearly', 'monthly', 'daily'].each do |term|
+       mapper.topic(term) do
+         mapper.group_date_by mapper.column[:date], term.to_sym
+         mapper.count_uniq mapper.column[:holiday]
+       end
+     end
+     mapper.emitted.should ==
+       [
+         {"yearly\t2010\tnewyearday" => 1},
+         {"monthly\t201001\tnewyearday" => 1},
+         {"daily\t20100101\tnewyearday" => 1}
+       ]
+   end
+
+   it 'can group by' do
+     value = '1 sub_2 bingo!'
+     mapper = LogAnalysisMapper.new(nil, nil, value)
+     mapper.separate(' ')
+     mapper.column_name 'id', 'sub_id', 'data'
+
+     mapper.topic('test') do
+       mapper.group_by mapper.column[:sub_id]
+       mapper.count_uniq mapper.column[:data]
+     end
+     mapper.emitted.should == [{"test\tsub_2\tbingo!" => 1}]
+   end
+ end
+
+ Topic = LogAnalysisMapperModel::Topic
+ describe Topic do
+   it 'can get key with label' do
+     t = Topic.new('label')
+     t.key.should == 'label'
+   end
+
+   it 'can get key with label and elements' do
+     t = Topic.new('label')
+     t.key_elements << 'e1'
+     t.key_elements << 'e2'
+     t.key.should == "label\te1\te2"
+   end
  end
 
  describe LogAnalysisReducer do
@@ -1,31 +1,33 @@
  require File.join(File.dirname(__FILE__) , 'spec_helper')
-
  require 'mapred_factory'
- require 'log_analysis'
 
- describe 'MapRed Factory' do
+ include HadoopDsl
 
+ describe 'MapRed Factory' do
    before(:all) do
-     @script = create_tmp_script("use 'LogAnalysis'")
+     @script = create_tmp_script("dsl 'LogAnalysis'")
    end
 
    it 'can create mapper' do
      mapper = MapperFactory.create(@script, nil, nil)
-     mapper.class.should == LogAnalysisMapper
+     mapper.class.should == LogAnalysis::LogAnalysisMapper
    end
 
    it 'can create reducer' do
      reducer = ReducerFactory.create(@script, nil, nil)
-     reducer.class.should == LogAnalysisReducer
+     reducer.class.should == LogAnalysis::LogAnalysisReducer
    end
 
    it 'can create setup' do
-     s = SetupFactory.create(@script, nil)
-     s.class.should == LogAnalysisSetup
+     conf = mock('conf')
+     conf.should_receive(:output_key_class=).once
+     conf.should_receive(:output_value_class=).once
+     s = SetupFactory.create(create_tmp_script("dsl 'HiveLike'"), conf)
+     s.class.should == HiveLike::HiveLikeSetup
    end
 
    it 'can create base if not exists in specific DSL' do
-     s = SetupFactory.create(create_tmp_script("use 'WordCount'"), nil)
+     s = SetupFactory.create(create_tmp_script("dsl 'WordCount'"), nil)
      s.class.should == BaseSetup
    end
 
@@ -37,6 +39,24 @@ describe 'MapRed Factory' do
    it 'can convert dsl name to dsl lib file and require' do
      dsl_name = MapRedFactory.dsl_name(@script)
      MapRedFactory.require_dsl_lib(dsl_name).should_not be_nil
-     LogAnalysisMapper
+     LogAnalysis::LogAnalysisMapper
+   end
+
+   it 'can create mapper if statement has double quote' do
+     script = create_tmp_script(%Q!dsl "LogAnalysis"!)
+     mapper = MapperFactory.create(script, nil, nil)
+     mapper.class.should == LogAnalysis::LogAnalysisMapper
+   end
+
+   it 'can create mapper if exists more space' do
+     script = create_tmp_script(%Q! dsl "LogAnalysis" !)
+     mapper = MapperFactory.create(script, nil, nil)
+     mapper.class.should == LogAnalysis::LogAnalysisMapper
+   end
+
+   it 'can create mapper if exists bracket' do
+     script = create_tmp_script(%Q! dsl ("LogAnalysis") !)
+     mapper = MapperFactory.create(script, nil, nil)
+     mapper.class.should == LogAnalysis::LogAnalysisMapper
    end
  end
@@ -1,5 +1,4 @@
  # spec helper
-
  require 'tempfile'
 
  def create_tmp_script(body)
@@ -1,19 +1,18 @@
  require File.join(File.dirname(__FILE__) , 'spec_helper')
-
  require 'util'
 
  describe 'utilities' do
    it 'can change camelcase str to snakecase' do
-     snake_case('CamelCaseStr').should == 'camel_case_str'
+     HadoopDsl.snake_case('CamelCaseStr').should == 'camel_case_str'
    end
 
    it 'can read file and get file data to string' do
      script_body = 'This is a script body.'
      @script = create_tmp_script(script_body)
-     read_file(@script).should == script_body
+     HadoopDsl.read_file(@script).should == script_body
    end
 
    it 'raise error if no file in loadpath' do
-     lambda { read_file('not_exists_on_loadpath') }.should raise_error
+     lambda { HadoopDsl.read_file('not_exists_on_loadpath') }.should raise_error
    end
  end
@@ -1,4 +1,4 @@
- require 'core'
+ require File.join(File.dirname(__FILE__), 'spec_helper')
  require 'word_count'
 
  include HadoopDsl::WordCount
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: hadoop-rubydsl
  version: !ruby/object:Gem::Version
-   version: 0.0.3
+   version: 0.0.4
  platform: ruby
  authors:
  - Koichi Fujikawa
@@ -9,7 +9,7 @@ autorequire:
  bindir: bin
  cert_chain: []
 
- date: 2010-01-04 00:00:00 +09:00
+ date: 2010-01-13 00:00:00 +09:00
  default_executable:
  dependencies:
  - !ruby/object:Gem::Dependency
@@ -32,27 +32,23 @@ extensions: []
 
  extra_rdoc_files:
  - README.rdoc
- - TODO
  files:
  - .gitignore
  - README.rdoc
  - Rakefile
- - TODO
  - VERSION
  - bin/hadoop-hudson.sh
  - bin/hadoop-ruby.sh
  - bin/hrd
  - conf/hadoop-site.xml
- - examples/apachelog-v2-2.rb
- - examples/apachelog-v2.rb
- - examples/apachelog.rb
  - examples/hive_like_test.rb
+ - examples/log_analysis_test.rb
  - examples/word_count_test.rb
  - hadoop-rubydsl.gemspec
- - lib/client.rb
  - lib/core.rb
  - lib/dsl_init.rb
- - lib/hadoop-dsl.rb
+ - lib/hadoop_dsl.rb
+ - lib/hadoop_dsl_client.rb
  - lib/hive_like.rb
  - lib/log_analysis.rb
  - lib/mapred_factory.rb
@@ -97,8 +93,6 @@ test_files:
  - spec/hive_like_spec.rb
  - spec/log_analysis_spec.rb
  - spec/example_spec.rb
- - examples/apachelog-v2.rb
  - examples/hive_like_test.rb
+ - examples/log_analysis_test.rb
  - examples/word_count_test.rb
- - examples/apachelog-v2-2.rb
- - examples/apachelog.rb
data/TODO DELETED
@@ -1,2 +0,0 @@
- * entire error handling
- * "use" method not allowed double quote..
@@ -1,25 +0,0 @@
- use 'LogAnalysis'
-
- data 'apache log on test1' do
-   from 'apachlog/inputs'
-   to 'apachlog/outputs'
-
-   each_line do
-     pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*)/
-     column_name 'remote_host', 'pass', 'user', 'access_date', 'request', 'status', 'bytes' # label each column
-
-     topic 'which users?', :label => 'user' do
-       count_uniq column[:user]
-     end
-
-     # topic 'access date by monthly' do
-     #   select_date column[:access_date], BY_MONTHLY
-     #   count column[:access_date]
-     # end
-     #
-     # topic 'total bytes' do
-     #   select_date column[:access_date], BY_MONTHLY
-     #   sum column[:bytes].to_kilobytes # / 1024
-     # end
-   end
- end
@@ -1,15 +0,0 @@
- # Apache log analysis
- #
- # example target data:
- # 127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326
- # 127.0.0.1 - frank2 [10/Oct/2000:13:55:36 -0700] "GET /apache_pb2.gif HTTP/1.0" 200 2326
- # 127.0.0.1 - frank2 [10/Oct/2000:13:55:36 -0700] "GET /apache_pb3.gif HTTP/1.0" 404 2326
-
- use 'LogAnalysis'
-
- data.pattern /(.*) (.*) (.*) (\[.*\]) (".*") (\d*) (\d*)/
- column[2].count_uniq
- column[3].count_uniq
- column[4].count_uniq
- column[5].count_uniq
- column[6].sum
1
- require 'jruby-on-hadoop'
2
- require 'client'
3
-
4
- module HadoopDsl
5
- def self.lib_path
6
- File.expand_path(File.dirname(__FILE__))
7
- end
8
-
9
- def self.dsl_init_script
10
- File.join(lib_path, "dsl_init.rb")
11
- end
12
- end