hadoop-rubydsl 0.0.4 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -26,7 +26,7 @@ You can get Hadoop job results in your hdfs wc/outputs/part-*
26
26
  == Examples
27
27
 
28
28
  Word Count DSL script
29
- use 'WordCount'
29
+ dsl 'WordCount'
30
30
 
31
31
  from 'wc/inputs'
32
32
  to 'wc/outputs'
@@ -35,7 +35,7 @@ Word Count DSL script
35
35
  total :bytes, :words, :lines
36
36
 
37
37
  Log Analysis DSL script
38
- use 'LogAnalysis'
38
+ dsl 'LogAnalysis'
39
39
 
40
40
  data 'apache log on test2' do
41
41
  from 'apachelog/inputs'
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.4
1
+ 0.0.5
@@ -23,14 +23,16 @@ data 'apache log on test2' do
23
23
 
24
24
  topic 'ua counts group by path' do
25
25
  request = column[:request].value
26
- path = request.split(/\s+/)[1]
27
- group_by path
26
+ if request
27
+ path = request.split(/\s+/)[1]
28
+ group_by path
29
+ end
28
30
  count_uniq column[:ua]
29
31
  end
30
32
 
31
33
  topic 'ua counts by daily' do
32
- group_date_by column[:access_date], :daily
33
- count_uniq column[:ua]
34
+ # group_date_by column[:access_date], :daily
35
+ # count_uniq column[:ua]
34
36
  end
35
37
 
36
38
  # topic 'total bytes' do
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{hadoop-rubydsl}
8
- s.version = "0.0.4"
8
+ s.version = "0.0.5"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Koichi Fujikawa"]
12
- s.date = %q{2010-01-13}
12
+ s.date = %q{2010-01-28}
13
13
  s.description = %q{Hadoop Ruby DSL}
14
14
  s.email = %q{fujibee@gmail.com}
15
15
  s.executables = ["hrd", "hadoop-hudson.sh", "hadoop-ruby.sh"]
@@ -5,8 +5,9 @@ module HadoopDsl
5
5
  # common
6
6
  module DslElement
7
7
  # all DSL statements without def is processed here
8
- def method_missing(method_name, *args)
9
- yield if block_given?
8
+ def method_missing(name, *args)
9
+ # if block given, labeled for non-local exit
10
+ catch name do; yield end if block_given?
10
11
  self
11
12
  end
12
13
  end
@@ -9,6 +9,8 @@ module HadoopDsl::LogAnalysis
9
9
 
10
10
  # controller
11
11
  class LogAnalysisMapper < HadoopDsl::BaseMapper
12
+ @@reg_cache = {}
13
+
12
14
  def initialize(script, key, value)
13
15
  super(script, LogAnalysisMapperModel.new(key, value))
14
16
  end
@@ -27,10 +29,16 @@ module HadoopDsl::LogAnalysis
27
29
  @model.create_or_replace_columns_with(parts) {|column, value| column.value = value}
28
30
  end
29
31
 
30
- def pattern(re)
32
+ def pattern(reg_str)
33
+ # try to get RE from cache
34
+ cached = @@reg_cache[reg_str]
35
+ re = cached ? @@reg_cache[reg_str] : Regexp.new(reg_str)
36
+ @@reg_cache[reg_str] ||= re # new cache
37
+
31
38
  if value =~ re
32
39
  md = Regexp.last_match
33
40
  @model.create_or_replace_columns_with(md.captures) {|column, value| column.value = value}
41
+ else throw :each_line # non-local exit
34
42
  end
35
43
  end
36
44
 
@@ -15,20 +15,31 @@ module HadoopDsl
15
15
  end
16
16
 
17
17
  class MapperFactory < MapRedFactory
18
+ # for cache in map loop
19
+ @@mapper_class = nil
18
20
  def self.create(script, key, value)
19
- dsl_name = self.dsl_name(script)
20
- require_dsl_lib(dsl_name)
21
- mapper_class = "HadoopDsl::#{dsl_name}::#{dsl_name}Mapper"
22
- return eval(mapper_class).new(script, key, value)
21
+ # once decide in map loop
22
+ unless @@mapper_class
23
+ dsl_name = self.dsl_name(script)
24
+ require_dsl_lib(dsl_name)
25
+ @@mapper_class = eval("HadoopDsl::#{dsl_name}::#{dsl_name}Mapper")
26
+ end
27
+
28
+ @@mapper_class.new(script, key, value)
23
29
  end
24
30
  end
25
31
 
26
32
  class ReducerFactory < MapRedFactory
33
+ @@reducer_class = nil
27
34
  def self.create(script, key, values)
28
- dsl_name = self.dsl_name(script)
29
- require_dsl_lib(dsl_name)
30
- reducer_class = "HadoopDsl::#{dsl_name}::#{dsl_name}Reducer"
31
- return eval(reducer_class).new(script, key, values)
35
+ # once decide in reduce loop
36
+ unless @@reducer_class
37
+ dsl_name = self.dsl_name(script)
38
+ require_dsl_lib(dsl_name)
39
+ @@reducer_class = eval("HadoopDsl::#{dsl_name}::#{dsl_name}Reducer")
40
+ end
41
+
42
+ @@reducer_class.new(script, key, values)
32
43
  end
33
44
  end
34
45
 
@@ -2,21 +2,37 @@
2
2
  require 'hadoop_dsl'
3
3
 
4
4
  module HadoopDsl
5
+ # file body cache
6
+ # reading file in map/reduce cause critical issues!
7
+ @@file_bodies = {}
8
+
5
9
  def self.snake_case(str)
6
10
  str.gsub(/\B[A-Z]/, '_\&').downcase
7
11
  end
8
12
 
9
13
  def self.read_file(file_name)
14
+ # use if cached
15
+ body = @@file_bodies[file_name] if @@file_bodies[file_name]
16
+
10
17
  # read as usual
11
- body = File.open(file_name).read rescue nil
12
- return body if body
18
+ body = File.open(file_name).read rescue nil unless body
13
19
 
14
20
  # read from loadpath
15
- $:.each do |path|
16
- body = File.open(File.join(path, file_name)).read rescue next
17
- return body if body
21
+ unless body
22
+ $:.each do |path|
23
+ body = File.open(File.join(path, file_name)).read rescue next
24
+ break
25
+ end
18
26
  end
19
27
 
20
- raise "cannot find file - #{file_name}"
28
+ raise "cannot find file - #{file_name}" unless body
29
+
30
+ # for cache
31
+ @@file_bodies[file_name] = body
32
+ body
33
+ end
34
+
35
+ def self.reset_dsl_file
36
+ @@file_bodies = {}
21
37
  end
22
38
  end
@@ -23,6 +23,15 @@ describe LogAnalysisMapper do
23
23
  mapper.column[2].value.should == 'frank'
24
24
  end
25
25
 
26
+ it 'should non-local exit if cannot separate by pattern' do
27
+ mapper = LogAnalysisMapper.new(nil, nil, @apache_log + " a")
28
+ mapper.each_line do
29
+ mapper.pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*)$/
30
+ fail 'should not be reached'
31
+ end
32
+ mapper.column[0].should be_nil
33
+ end
34
+
26
35
  it 'should label column name by string' do
27
36
  mapper = LogAnalysisMapper.new(nil, nil, @apache_log)
28
37
  mapper.pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*)/
@@ -59,4 +59,16 @@ describe 'MapRed Factory' do
59
59
  mapper = MapperFactory.create(script, nil, nil)
60
60
  mapper.class.should == LogAnalysis::LogAnalysisMapper
61
61
  end
62
+
63
+ it 'can create mapper from class name cache' do
64
+ mapper = MapperFactory.create(@script, nil, nil)
65
+ mapper2 = MapperFactory.create(@script, nil, nil)
66
+ mapper.class.should == mapper2.class
67
+ end
68
+
69
+ it 'can create reducer from class name cache' do
70
+ reducer = ReducerFactory.create(@script, nil, nil)
71
+ reducer2 = ReducerFactory.create(@script, nil, nil)
72
+ reducer.class.should == reducer2.class
73
+ end
62
74
  end
@@ -2,17 +2,33 @@ require File.join(File.dirname(__FILE__) , 'spec_helper')
2
2
  require 'util'
3
3
 
4
4
  describe 'utilities' do
5
+ before do
6
+ HadoopDsl.reset_dsl_file
7
+ @script_body = 'This is a script body.'
8
+ @script = create_tmp_script(@script_body)
9
+ end
10
+
5
11
  it 'can change camelcase str to snakecase' do
6
12
  HadoopDsl.snake_case('CamelCaseStr').should == 'camel_case_str'
7
13
  end
8
14
 
9
15
  it 'can read file and get file data to string' do
10
- script_body = 'This is a script body.'
11
- @script = create_tmp_script(script_body)
12
- HadoopDsl.read_file(@script).should == script_body
16
+ HadoopDsl.read_file(@script).should == @script_body
13
17
  end
14
18
 
15
19
  it 'raise error if no file in loadpath' do
16
20
  lambda { HadoopDsl.read_file('not_exists_on_loadpath') }.should raise_error
17
21
  end
22
+
23
+ it 'can load from cache if script is loaded' do
24
+ HadoopDsl.read_file(@script).should == @script_body
25
+ File.delete(@script)
26
+ HadoopDsl.read_file(@script).should == @script_body
27
+ end
28
+
29
+ it 'can load from each cache even if one script is loaded' do
30
+ HadoopDsl.read_file(@script).should == @script_body
31
+ another_script = create_tmp_script("another")
32
+ HadoopDsl.read_file(another_script).should == "another"
33
+ end
18
34
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hadoop-rubydsl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Koichi Fujikawa
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2010-01-13 00:00:00 +09:00
12
+ date: 2010-01-28 00:00:00 +09:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency