hadoop-rubydsl 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +2 -2
- data/VERSION +1 -1
- data/examples/log_analysis_test.rb +6 -4
- data/hadoop-rubydsl.gemspec +2 -2
- data/lib/core.rb +3 -2
- data/lib/log_analysis.rb +9 -1
- data/lib/mapred_factory.rb +19 -8
- data/lib/util.rb +22 -6
- data/spec/log_analysis_spec.rb +9 -0
- data/spec/mapred_factory_spec.rb +12 -0
- data/spec/util_spec.rb +19 -3
- metadata +2 -2
data/README.rdoc
CHANGED
@@ -26,7 +26,7 @@ You can get Hadoop job results in your hdfs wc/outputs/part-*
|
|
26
26
|
== Examples
|
27
27
|
|
28
28
|
Word Count DSL script
|
29
|
-
|
29
|
+
dsl 'WordCount'
|
30
30
|
|
31
31
|
from 'wc/inputs'
|
32
32
|
to 'wc/outputs'
|
@@ -35,7 +35,7 @@ Word Count DSL script
|
|
35
35
|
total :bytes, :words, :lines
|
36
36
|
|
37
37
|
Log Analysis DSL script
|
38
|
-
|
38
|
+
dsl 'LogAnalysis'
|
39
39
|
|
40
40
|
data 'apache log on test2' do
|
41
41
|
from 'apachelog/inputs'
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.4
|
1
|
+
0.0.5
|
@@ -23,14 +23,16 @@ data 'apache log on test2' do
|
|
23
23
|
|
24
24
|
topic 'ua counts group by path' do
|
25
25
|
request = column[:request].value
|
26
|
-
|
27
|
-
|
26
|
+
if request
|
27
|
+
path = request.split(/\s+/)[1]
|
28
|
+
group_by path
|
29
|
+
end
|
28
30
|
count_uniq column[:ua]
|
29
31
|
end
|
30
32
|
|
31
33
|
topic 'ua counts by daily' do
|
32
|
-
group_date_by column[:access_date], :daily
|
33
|
-
count_uniq column[:ua]
|
34
|
+
# group_date_by column[:access_date], :daily
|
35
|
+
# count_uniq column[:ua]
|
34
36
|
end
|
35
37
|
|
36
38
|
# topic 'total bytes' do
|
data/hadoop-rubydsl.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{hadoop-rubydsl}
|
8
|
-
s.version = "0.0.4"
|
8
|
+
s.version = "0.0.5"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Koichi Fujikawa"]
|
12
|
-
s.date = %q{2010-01-
|
12
|
+
s.date = %q{2010-01-28}
|
13
13
|
s.description = %q{Hadoop Ruby DSL}
|
14
14
|
s.email = %q{fujibee@gmail.com}
|
15
15
|
s.executables = ["hrd", "hadoop-hudson.sh", "hadoop-ruby.sh"]
|
data/lib/core.rb
CHANGED
@@ -5,8 +5,9 @@ module HadoopDsl
|
|
5
5
|
# common
|
6
6
|
module DslElement
|
7
7
|
# all DSL statements without def is processed here
|
8
|
-
def method_missing(
|
9
|
-
|
8
|
+
def method_missing(name, *args)
|
9
|
+
# if block given, labeled for non-local exit
|
10
|
+
catch name do; yield end if block_given?
|
10
11
|
self
|
11
12
|
end
|
12
13
|
end
|
data/lib/log_analysis.rb
CHANGED
@@ -9,6 +9,8 @@ module HadoopDsl::LogAnalysis
|
|
9
9
|
|
10
10
|
# controller
|
11
11
|
class LogAnalysisMapper < HadoopDsl::BaseMapper
|
12
|
+
@@reg_cache = {}
|
13
|
+
|
12
14
|
def initialize(script, key, value)
|
13
15
|
super(script, LogAnalysisMapperModel.new(key, value))
|
14
16
|
end
|
@@ -27,10 +29,16 @@ module HadoopDsl::LogAnalysis
|
|
27
29
|
@model.create_or_replace_columns_with(parts) {|column, value| column.value = value}
|
28
30
|
end
|
29
31
|
|
30
|
-
def pattern(
|
32
|
+
def pattern(reg_str)
|
33
|
+
# try to get RE from cache
|
34
|
+
cached = @@reg_cache[reg_str]
|
35
|
+
re = cached ? @@reg_cache[reg_str] : Regexp.new(reg_str)
|
36
|
+
@@reg_cache[reg_str] ||= re # new cache
|
37
|
+
|
31
38
|
if value =~ re
|
32
39
|
md = Regexp.last_match
|
33
40
|
@model.create_or_replace_columns_with(md.captures) {|column, value| column.value = value}
|
41
|
+
else throw :each_line # non-local exit
|
34
42
|
end
|
35
43
|
end
|
36
44
|
|
data/lib/mapred_factory.rb
CHANGED
@@ -15,20 +15,31 @@ module HadoopDsl
|
|
15
15
|
end
|
16
16
|
|
17
17
|
class MapperFactory < MapRedFactory
|
18
|
+
# for cache in map loop
|
19
|
+
@@mapper_class = nil
|
18
20
|
def self.create(script, key, value)
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
21
|
+
# once decide in map loop
|
22
|
+
unless @@mapper_class
|
23
|
+
dsl_name = self.dsl_name(script)
|
24
|
+
require_dsl_lib(dsl_name)
|
25
|
+
@@mapper_class = eval("HadoopDsl::#{dsl_name}::#{dsl_name}Mapper")
|
26
|
+
end
|
27
|
+
|
28
|
+
@@mapper_class.new(script, key, value)
|
23
29
|
end
|
24
30
|
end
|
25
31
|
|
26
32
|
class ReducerFactory < MapRedFactory
|
33
|
+
@@reducer_class = nil
|
27
34
|
def self.create(script, key, values)
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
35
|
+
# once decide in reduce loop
|
36
|
+
unless @@reducer_class
|
37
|
+
dsl_name = self.dsl_name(script)
|
38
|
+
require_dsl_lib(dsl_name)
|
39
|
+
@@reducer_class = eval("HadoopDsl::#{dsl_name}::#{dsl_name}Reducer")
|
40
|
+
end
|
41
|
+
|
42
|
+
@@reducer_class.new(script, key, values)
|
32
43
|
end
|
33
44
|
end
|
34
45
|
|
data/lib/util.rb
CHANGED
@@ -2,21 +2,37 @@
|
|
2
2
|
require 'hadoop_dsl'
|
3
3
|
|
4
4
|
module HadoopDsl
|
5
|
+
# file body cache
|
6
|
+
# reading file in map/reduce cause critical issues!
|
7
|
+
@@file_bodies = {}
|
8
|
+
|
5
9
|
def self.snake_case(str)
|
6
10
|
str.gsub(/\B[A-Z]/, '_\&').downcase
|
7
11
|
end
|
8
12
|
|
9
13
|
def self.read_file(file_name)
|
14
|
+
# use if cached
|
15
|
+
body = @@file_bodies[file_name] if @@file_bodies[file_name]
|
16
|
+
|
10
17
|
# read as usual
|
11
|
-
body = File.open(file_name).read rescue nil
|
12
|
-
return body if body
|
18
|
+
body = File.open(file_name).read rescue nil unless body
|
13
19
|
|
14
20
|
# read from loadpath
|
15
|
-
|
16
|
-
|
17
|
-
|
21
|
+
unless body
|
22
|
+
$:.each do |path|
|
23
|
+
body = File.open(File.join(path, file_name)).read rescue next
|
24
|
+
break
|
25
|
+
end
|
18
26
|
end
|
19
27
|
|
20
|
-
raise "cannot find file - #{file_name}"
|
28
|
+
raise "cannot find file - #{file_name}" unless body
|
29
|
+
|
30
|
+
# for cache
|
31
|
+
@@file_bodies[file_name] = body
|
32
|
+
body
|
33
|
+
end
|
34
|
+
|
35
|
+
def self.reset_dsl_file
|
36
|
+
@@file_bodies = {}
|
21
37
|
end
|
22
38
|
end
|
data/spec/log_analysis_spec.rb
CHANGED
@@ -23,6 +23,15 @@ describe LogAnalysisMapper do
|
|
23
23
|
mapper.column[2].value.should == 'frank'
|
24
24
|
end
|
25
25
|
|
26
|
+
it 'should non-local exit if cannot separate by pattern' do
|
27
|
+
mapper = LogAnalysisMapper.new(nil, nil, @apache_log + " a")
|
28
|
+
mapper.each_line do
|
29
|
+
mapper.pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*)$/
|
30
|
+
fail 'should not be reached'
|
31
|
+
end
|
32
|
+
mapper.column[0].should be_nil
|
33
|
+
end
|
34
|
+
|
26
35
|
it 'should label column name by string' do
|
27
36
|
mapper = LogAnalysisMapper.new(nil, nil, @apache_log)
|
28
37
|
mapper.pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*)/
|
data/spec/mapred_factory_spec.rb
CHANGED
@@ -59,4 +59,16 @@ describe 'MapRed Factory' do
|
|
59
59
|
mapper = MapperFactory.create(script, nil, nil)
|
60
60
|
mapper.class.should == LogAnalysis::LogAnalysisMapper
|
61
61
|
end
|
62
|
+
|
63
|
+
it 'can create mapper from class name cache' do
|
64
|
+
mapper = MapperFactory.create(@script, nil, nil)
|
65
|
+
mapper2 = MapperFactory.create(@script, nil, nil)
|
66
|
+
mapper.class.should == mapper2.class
|
67
|
+
end
|
68
|
+
|
69
|
+
it 'can create reducer from class name cache' do
|
70
|
+
reducer = ReducerFactory.create(@script, nil, nil)
|
71
|
+
reducer2 = ReducerFactory.create(@script, nil, nil)
|
72
|
+
reducer.class.should == reducer2.class
|
73
|
+
end
|
62
74
|
end
|
data/spec/util_spec.rb
CHANGED
@@ -2,17 +2,33 @@ require File.join(File.dirname(__FILE__) , 'spec_helper')
|
|
2
2
|
require 'util'
|
3
3
|
|
4
4
|
describe 'utilities' do
|
5
|
+
before do
|
6
|
+
HadoopDsl.reset_dsl_file
|
7
|
+
@script_body = 'This is a script body.'
|
8
|
+
@script = create_tmp_script(@script_body)
|
9
|
+
end
|
10
|
+
|
5
11
|
it 'can change camelcase str to snakecase' do
|
6
12
|
HadoopDsl.snake_case('CamelCaseStr').should == 'camel_case_str'
|
7
13
|
end
|
8
14
|
|
9
15
|
it 'can read file and get file data to string' do
|
10
|
-
|
11
|
-
@script = create_tmp_script(script_body)
|
12
|
-
HadoopDsl.read_file(@script).should == script_body
|
16
|
+
HadoopDsl.read_file(@script).should == @script_body
|
13
17
|
end
|
14
18
|
|
15
19
|
it 'raise error if no file in loadpath' do
|
16
20
|
lambda { HadoopDsl.read_file('not_exists_on_loadpath') }.should raise_error
|
17
21
|
end
|
22
|
+
|
23
|
+
it 'can load from cache if script is loaded' do
|
24
|
+
HadoopDsl.read_file(@script).should == @script_body
|
25
|
+
File.delete(@script)
|
26
|
+
HadoopDsl.read_file(@script).should == @script_body
|
27
|
+
end
|
28
|
+
|
29
|
+
it 'can load from each cache even if one script is loaded' do
|
30
|
+
HadoopDsl.read_file(@script).should == @script_body
|
31
|
+
another_script = create_tmp_script("another")
|
32
|
+
HadoopDsl.read_file(another_script).should == "another"
|
33
|
+
end
|
18
34
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hadoop-rubydsl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Koichi Fujikawa
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2010-01-
|
12
|
+
date: 2010-01-28 00:00:00 +09:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|