hadoop-rubydsl 0.0.4 → 0.0.5
- data/README.rdoc +2 -2
- data/VERSION +1 -1
- data/examples/log_analysis_test.rb +6 -4
- data/hadoop-rubydsl.gemspec +2 -2
- data/lib/core.rb +3 -2
- data/lib/log_analysis.rb +9 -1
- data/lib/mapred_factory.rb +19 -8
- data/lib/util.rb +22 -6
- data/spec/log_analysis_spec.rb +9 -0
- data/spec/mapred_factory_spec.rb +12 -0
- data/spec/util_spec.rb +19 -3
- metadata +2 -2
data/README.rdoc
CHANGED
@@ -26,7 +26,7 @@ You can get Hadoop job results in your hdfs wc/outputs/part-*
 == Examples
 
 Word Count DSL script
-
+  dsl 'WordCount'
 
   from 'wc/inputs'
   to 'wc/outputs'
@@ -35,7 +35,7 @@ Word Count DSL script
   total :bytes, :words, :lines
 
 Log Analysis DSL script
-
+  dsl 'LogAnalysis'
 
   data 'apache log on test2' do
     from 'apachelog/inputs'
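Assembled from the README excerpts above, the Word Count example reads roughly as follows (a sketch; only the lines visible in this diff are certain, and lines between the two hunks are not shown here):

  dsl 'WordCount'

  from 'wc/inputs'
  to 'wc/outputs'

  total :bytes, :words, :lines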
data/VERSION
CHANGED
@@ -1 +1 @@
-0.0.4
+0.0.5
data/examples/log_analysis_test.rb
CHANGED
@@ -23,14 +23,16 @@ data 'apache log on test2' do
 
   topic 'ua counts group by path' do
     request = column[:request].value
-
-
+    if request
+      path = request.split(/\s+/)[1]
+      group_by path
+    end
     count_uniq column[:ua]
   end
 
   topic 'ua counts by daily' do
-    group_date_by column[:access_date], :daily
-    count_uniq column[:ua]
+    # group_date_by column[:access_date], :daily
+    # count_uniq column[:ua]
   end
 
   # topic 'total bytes' do
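The example now guards against a request column with no value before deriving the path. A standalone sketch of the same guard (the sample request string is hypothetical):

  request = 'GET /index.html HTTP/1.0'   # hypothetical value of column[:request]
  if request
    path = request.split(/\s+/)[1]       # => "/index.html"
    puts path
  end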
data/hadoop-rubydsl.gemspec
CHANGED
@@ -5,11 +5,11 @@
 
 Gem::Specification.new do |s|
   s.name = %q{hadoop-rubydsl}
-  s.version = "0.0.4"
+  s.version = "0.0.5"
 
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Koichi Fujikawa"]
-  s.date = %q{2010-01-
+  s.date = %q{2010-01-28}
   s.description = %q{Hadoop Ruby DSL}
   s.email = %q{fujibee@gmail.com}
   s.executables = ["hrd", "hadoop-hudson.sh", "hadoop-ruby.sh"]
data/lib/core.rb
CHANGED
@@ -5,8 +5,9 @@ module HadoopDsl
   # common
   module DslElement
     # all DSL statements without def is processed here
-    def method_missing(
-
+    def method_missing(name, *args)
+      # if block given, labeled for non-local exit
+      catch name do; yield end if block_given?
       self
     end
   end
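The rewritten method_missing wraps any DSL block in catch, labelled with the statement name, so code deeper in the run (for example pattern in log_analysis.rb below) can bail out with throw. A minimal standalone sketch of that labelled non-local exit:

  def each_line
    catch :each_line do          # label the block with the statement name
      yield
      puts 'processed the whole block'
    end
    puts 'execution resumes here after a throw'
  end

  each_line do
    throw :each_line             # e.g. the line did not match the pattern
    puts 'never reached'
  end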
data/lib/log_analysis.rb
CHANGED
@@ -9,6 +9,8 @@ module HadoopDsl::LogAnalysis
 
   # controller
   class LogAnalysisMapper < HadoopDsl::BaseMapper
+    @@reg_cache = {}
+
     def initialize(script, key, value)
       super(script, LogAnalysisMapperModel.new(key, value))
     end
@@ -27,10 +29,16 @@ module HadoopDsl::LogAnalysis
       @model.create_or_replace_columns_with(parts) {|column, value| column.value = value}
     end
 
-    def pattern(
+    def pattern(reg_str)
+      # try to get RE from cache
+      cached = @@reg_cache[reg_str]
+      re = cached ? @@reg_cache[reg_str] : Regexp.new(reg_str)
+      @@reg_cache[reg_str] ||= re # new cache
+
       if value =~ re
         md = Regexp.last_match
         @model.create_or_replace_columns_with(md.captures) {|column, value| column.value = value}
+      else throw :each_line # non-local exit
       end
     end
 
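pattern now compiles each regular expression string once and keeps the Regexp in a class variable, since the mapper replays the same DSL script for every input line. A minimal sketch of that cache, using a hypothetical PatternCache name:

  class PatternCache
    @@reg_cache = {}

    def self.fetch(reg_str)
      # compile on first use, reuse the same Regexp object afterwards
      @@reg_cache[reg_str] ||= Regexp.new(reg_str)
    end
  end

  a = PatternCache.fetch('(\S+) (\S+)')
  b = PatternCache.fetch('(\S+) (\S+)')
  puts a.equal?(b)   # => true, the second call hits the cache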
data/lib/mapred_factory.rb
CHANGED
@@ -15,20 +15,31 @@ module HadoopDsl
   end
 
   class MapperFactory < MapRedFactory
+    # for cache in map loop
+    @@mapper_class = nil
     def self.create(script, key, value)
-
-
-
-
+      # once decide in map loop
+      unless @@mapper_class
+        dsl_name = self.dsl_name(script)
+        require_dsl_lib(dsl_name)
+        @@mapper_class = eval("HadoopDsl::#{dsl_name}::#{dsl_name}Mapper")
+      end
+
+      @@mapper_class.new(script, key, value)
     end
   end
 
   class ReducerFactory < MapRedFactory
+    @@reducer_class = nil
     def self.create(script, key, values)
-
-
-
-
+      # once decide in reduce loop
+      unless @@reducer_class
+        dsl_name = self.dsl_name(script)
+        require_dsl_lib(dsl_name)
+        @@reducer_class = eval("HadoopDsl::#{dsl_name}::#{dsl_name}Reducer")
+      end
+
+      @@reducer_class.new(script, key, values)
     end
   end
 
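Both factories now resolve the mapper/reducer class once and reuse it for every record instead of re-deriving and eval'ing the constant on each call; a side effect of the class variable is that the first script seen decides the class for the rest of the process. A standalone sketch of the same memoization with a hypothetical WorkerFactory (const_get stands in for the dsl_name/require_dsl_lib/eval steps):

  class WorkerFactory
    @@worker_class = nil

    def self.create(class_name, *args)
      # resolve the constant only on the first call, then reuse it
      @@worker_class ||= Object.const_get(class_name)
      @@worker_class.new(*args)
    end
  end

  a = WorkerFactory.create('String', 'record 1')
  b = WorkerFactory.create('String', 'record 2')
  puts a.class == b.class   # => true, both built from the cached class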
data/lib/util.rb
CHANGED
@@ -2,21 +2,37 @@
 require 'hadoop_dsl'
 
 module HadoopDsl
+  # file body cache
+  # reading file in map/reduce cause critical issues!
+  @@file_bodies = {}
+
   def self.snake_case(str)
     str.gsub(/\B[A-Z]/, '_\&').downcase
   end
 
   def self.read_file(file_name)
+    # use if cached
+    body = @@file_bodies[file_name] if @@file_bodies[file_name]
+
     # read as usual
-    body = File.open(file_name).read rescue nil
-    return body if body
+    body = File.open(file_name).read rescue nil unless body
 
     # read from loadpath
-
-
-
+    unless body
+      $:.each do |path|
+        body = File.open(File.join(path, file_name)).read rescue next
+        break
+      end
     end
 
-    raise "cannot find file - #{file_name}"
+    raise "cannot find file - #{file_name}" unless body
+
+    # for cache
+    @@file_bodies[file_name] = body
+    body
+  end
+
+  def self.reset_dsl_file
+    @@file_bodies = {}
   end
 end
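read_file now memoizes file bodies in @@file_bodies, so a script is read from disk (or the load path) once and served from memory on later calls, and reset_dsl_file clears the cache between runs. A minimal sketch of the same idea with a hypothetical ScriptCache module:

  require 'tempfile'

  module ScriptCache
    @@file_bodies = {}

    def self.read_file(file_name)
      # first call reads from disk, later calls return the cached body
      @@file_bodies[file_name] ||= File.read(file_name)
    end

    def self.reset
      @@file_bodies = {}
    end
  end

  t = Tempfile.new('script')
  t.write('dsl body'); t.close
  path = t.path
  puts ScriptCache.read_file(path)   # read from disk
  t.unlink                           # file removed
  puts ScriptCache.read_file(path)   # still answered from the cache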
data/spec/log_analysis_spec.rb
CHANGED
@@ -23,6 +23,15 @@ describe LogAnalysisMapper do
     mapper.column[2].value.should == 'frank'
   end
 
+  it 'should non-local exit if cannot separate by pattern' do
+    mapper = LogAnalysisMapper.new(nil, nil, @apache_log + " a")
+    mapper.each_line do
+      mapper.pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*)$/
+      fail 'should not be reached'
+    end
+    mapper.column[0].should be_nil
+  end
+
   it 'should label column name by string' do
     mapper = LogAnalysisMapper.new(nil, nil, @apache_log)
     mapper.pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*)/
data/spec/mapred_factory_spec.rb
CHANGED
@@ -59,4 +59,16 @@ describe 'MapRed Factory' do
     mapper = MapperFactory.create(script, nil, nil)
     mapper.class.should == LogAnalysis::LogAnalysisMapper
   end
+
+  it 'can create mapper from class name cache' do
+    mapper = MapperFactory.create(@script, nil, nil)
+    mapper2 = MapperFactory.create(@script, nil, nil)
+    mapper.class.should == mapper2.class
+  end
+
+  it 'can create reducer from class name cache' do
+    reducer = ReducerFactory.create(@script, nil, nil)
+    reducer2 = ReducerFactory.create(@script, nil, nil)
+    reducer.class.should == reducer2.class
+  end
 end
data/spec/util_spec.rb
CHANGED
@@ -2,17 +2,33 @@ require File.join(File.dirname(__FILE__) , 'spec_helper')
 require 'util'
 
 describe 'utilities' do
+  before do
+    HadoopDsl.reset_dsl_file
+    @script_body = 'This is a script body.'
+    @script = create_tmp_script(@script_body)
+  end
+
   it 'can change camelcase str to snakecase' do
     HadoopDsl.snake_case('CamelCaseStr').should == 'camel_case_str'
   end
 
   it 'can read file and get file data to string' do
-
-    @script = create_tmp_script(script_body)
-    HadoopDsl.read_file(@script).should == script_body
+    HadoopDsl.read_file(@script).should == @script_body
   end
 
   it 'raise error if no file in loadpath' do
     lambda { HadoopDsl.read_file('not_exists_on_loadpath') }.should raise_error
   end
+
+  it 'can load from cache if script is loaded' do
+    HadoopDsl.read_file(@script).should == @script_body
+    File.delete(@script)
+    HadoopDsl.read_file(@script).should == @script_body
+  end
+
+  it 'can load from each cache even if one script is loaded' do
+    HadoopDsl.read_file(@script).should == @script_body
+    another_script = create_tmp_script("another")
+    HadoopDsl.read_file(another_script).should == "another"
+  end
 end
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: hadoop-rubydsl
 version: !ruby/object:Gem::Version
-  version: 0.0.4
+  version: 0.0.5
 platform: ruby
 authors:
 - Koichi Fujikawa
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2010-01-
+date: 2010-01-28 00:00:00 +09:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency