hadoop-rubydsl 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -26,7 +26,7 @@ You can get Hadoop job results in your hdfs wc/outputs/part-*
 == Examples

 Word Count DSL script
-use 'WordCount'
+dsl 'WordCount'

 from 'wc/inputs'
 to 'wc/outputs'
@@ -35,7 +35,7 @@ Word Count DSL script
 total :bytes, :words, :lines

 Log Analysis DSL script
-use 'LogAnalysis'
+dsl 'LogAnalysis'

 data 'apache log on test2' do
   from 'apachelog/inputs'
data/VERSION CHANGED
@@ -1 +1 @@
-0.0.4
+0.0.5
@@ -23,14 +23,16 @@ data 'apache log on test2' do

   topic 'ua counts group by path' do
     request = column[:request].value
-    path = request.split(/\s+/)[1]
-    group_by path
+    if request
+      path = request.split(/\s+/)[1]
+      group_by path
+    end
     count_uniq column[:ua]
   end

   topic 'ua counts by daily' do
-    group_date_by column[:access_date], :daily
-    count_uniq column[:ua]
+    # group_date_by column[:access_date], :daily
+    # count_uniq column[:ua]
   end

   # topic 'total bytes' do
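
Note on the guard above: it skips the topic body when the request column holds nil, since calling split on nil would raise NoMethodError inside the map loop. For reference, what the split extracts from a request field (the sample string below is hypothetical, not taken from the gem):

  request = 'GET /index.html HTTP/1.0'
  path = request.split(/\s+/)[1] if request  # => "/index.html"; nil requests are skipped
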
@@ -5,11 +5,11 @@

 Gem::Specification.new do |s|
   s.name = %q{hadoop-rubydsl}
-  s.version = "0.0.4"
+  s.version = "0.0.5"

   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Koichi Fujikawa"]
-  s.date = %q{2010-01-13}
+  s.date = %q{2010-01-28}
   s.description = %q{Hadoop Ruby DSL}
   s.email = %q{fujibee@gmail.com}
   s.executables = ["hrd", "hadoop-hudson.sh", "hadoop-ruby.sh"]
@@ -5,8 +5,9 @@ module HadoopDsl
   # common
   module DslElement
     # all DSL statements without def is processed here
-    def method_missing(method_name, *args)
-      yield if block_given?
+    def method_missing(name, *args)
+      # if block given, labeled for non-local exit
+      catch name do; yield end if block_given?
       self
     end
   end
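
The catch added above labels every block-style DSL statement with its own name, so code running inside the block can abort it with throw. A minimal stand-alone sketch of the Ruby catch/throw mechanics this relies on (method and label names here are illustrative, not part of the gem):

  # catch(:each_line) runs the block; throw :each_line unwinds straight back to it.
  def run_labeled(name)
    catch(name) { yield }
  end

  run_labeled(:each_line) do
    throw :each_line       # e.g. a line that cannot be parsed
    puts 'never reached'   # skipped by the non-local exit
  end
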
@@ -9,6 +9,8 @@ module HadoopDsl::LogAnalysis

   # controller
   class LogAnalysisMapper < HadoopDsl::BaseMapper
+    @@reg_cache = {}
+
     def initialize(script, key, value)
       super(script, LogAnalysisMapperModel.new(key, value))
     end
@@ -27,10 +29,16 @@ module HadoopDsl::LogAnalysis
       @model.create_or_replace_columns_with(parts) {|column, value| column.value = value}
     end

-    def pattern(re)
+    def pattern(reg_str)
+      # try to get RE from cache
+      cached = @@reg_cache[reg_str]
+      re = cached ? @@reg_cache[reg_str] : Regexp.new(reg_str)
+      @@reg_cache[reg_str] ||= re # new cache
+
       if value =~ re
         md = Regexp.last_match
         @model.create_or_replace_columns_with(md.captures) {|column, value| column.value = value}
+      else throw :each_line # non-local exit
       end
     end

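Two changes above: the compiled Regexp is now memoized per pattern string, so it is not rebuilt for every input line, and a line that does not match throws :each_line, presumably landing in the catch that method_missing now wraps around the each_line block. The three-line cache lookup can be collapsed with ||=; a sketch of the same memoization idea outside the gem (constant and method names are illustrative):

  # Memoize compiled regexps keyed by their source string.
  REG_CACHE = {}

  def compile_cached(reg_str)
    REG_CACHE[reg_str] ||= Regexp.new(reg_str)
  end

  compile_cached('(\d+) (\w+)').equal?(compile_cached('(\d+) (\w+)'))  # => true, same object reused
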
@@ -15,20 +15,31 @@ module HadoopDsl
   end

   class MapperFactory < MapRedFactory
+    # for cache in map loop
+    @@mapper_class = nil
     def self.create(script, key, value)
-      dsl_name = self.dsl_name(script)
-      require_dsl_lib(dsl_name)
-      mapper_class = "HadoopDsl::#{dsl_name}::#{dsl_name}Mapper"
-      return eval(mapper_class).new(script, key, value)
+      # once decide in map loop
+      unless @@mapper_class
+        dsl_name = self.dsl_name(script)
+        require_dsl_lib(dsl_name)
+        @@mapper_class = eval("HadoopDsl::#{dsl_name}::#{dsl_name}Mapper")
+      end
+
+      @@mapper_class.new(script, key, value)
     end
   end

   class ReducerFactory < MapRedFactory
+    @@reducer_class = nil
     def self.create(script, key, values)
-      dsl_name = self.dsl_name(script)
-      require_dsl_lib(dsl_name)
-      reducer_class = "HadoopDsl::#{dsl_name}::#{dsl_name}Reducer"
-      return eval(reducer_class).new(script, key, values)
+      # once decide in reduce loop
+      unless @@reducer_class
+        dsl_name = self.dsl_name(script)
+        require_dsl_lib(dsl_name)
+        @@reducer_class = eval("HadoopDsl::#{dsl_name}::#{dsl_name}Reducer")
+      end
+
+      @@reducer_class.new(script, key, values)
     end
   end

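Both factories now resolve the generated mapper/reducer class once and keep it in a class variable, so dsl_name, require_dsl_lib and eval only run for the first record of a map or reduce loop; since the variable is never cleared, this presumably assumes one DSL script per task process. A condensed sketch of the same class-level memoization (names are placeholders, not the gem's API):

  # Resolve an implementation class on first use, then reuse it for every record.
  class CachedFactory
    @@klass = nil

    def self.create(*args)
      @@klass ||= Object.const_get('String')  # stand-in for the eval'd DSL class
      @@klass.new(*args)
    end
  end

  CachedFactory.create('a')  # resolves String once
  CachedFactory.create('b')  # reuses the cached class
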
@@ -2,21 +2,37 @@
 require 'hadoop_dsl'

 module HadoopDsl
+  # file body cache
+  # reading file in map/reduce cause critical issues!
+  @@file_bodies = {}
+
   def self.snake_case(str)
     str.gsub(/\B[A-Z]/, '_\&').downcase
   end

   def self.read_file(file_name)
+    # use if cached
+    body = @@file_bodies[file_name] if @@file_bodies[file_name]
+
     # read as usual
-    body = File.open(file_name).read rescue nil
-    return body if body
+    body = File.open(file_name).read rescue nil unless body

     # read from loadpath
-    $:.each do |path|
-      body = File.open(File.join(path, file_name)).read rescue next
-      return body if body
+    unless body
+      $:.each do |path|
+        body = File.open(File.join(path, file_name)).read rescue next
+        break
+      end
     end

-    raise "cannot find file - #{file_name}"
+    raise "cannot find file - #{file_name}" unless body
+
+    # for cache
+    @@file_bodies[file_name] = body
+    body
+  end
+
+  def self.reset_dsl_file
+    @@file_bodies = {}
   end
 end
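
read_file now memoizes file bodies in @@file_bodies (the in-code comment notes that re-reading files inside map/reduce caused problems), and reset_dsl_file clears the cache so the specs below can start clean. A compact sketch of the same cache-then-fallback flow (helper and constant names are illustrative, not the gem's API):

  # Cache file bodies by name; fall back to disk, then to the load path.
  BODIES = {}

  def read_cached(file_name)
    BODIES[file_name] ||= begin
      body = File.read(file_name) rescue nil
      body ||= $:.map { |path| File.read(File.join(path, file_name)) rescue nil }.compact.first
      raise "cannot find file - #{file_name}" unless body
      body
    end
  end
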
@@ -23,6 +23,15 @@ describe LogAnalysisMapper do
     mapper.column[2].value.should == 'frank'
   end

+  it 'should non-local exit if cannot separate by pattern' do
+    mapper = LogAnalysisMapper.new(nil, nil, @apache_log + " a")
+    mapper.each_line do
+      mapper.pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*)$/
+      fail 'should not be reached'
+    end
+    mapper.column[0].should be_nil
+  end
+
   it 'should label column name by string' do
     mapper = LogAnalysisMapper.new(nil, nil, @apache_log)
     mapper.pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*)/
@@ -59,4 +59,16 @@ describe 'MapRed Factory' do
     mapper = MapperFactory.create(script, nil, nil)
     mapper.class.should == LogAnalysis::LogAnalysisMapper
   end
+
+  it 'can create mapper from class name cache' do
+    mapper = MapperFactory.create(@script, nil, nil)
+    mapper2 = MapperFactory.create(@script, nil, nil)
+    mapper.class.should == mapper2.class
+  end
+
+  it 'can create reducer from class name cache' do
+    reducer = ReducerFactory.create(@script, nil, nil)
+    reducer2 = ReducerFactory.create(@script, nil, nil)
+    reducer.class.should == reducer2.class
+  end
 end
@@ -2,17 +2,33 @@ require File.join(File.dirname(__FILE__) , 'spec_helper')
 require 'util'

 describe 'utilities' do
+  before do
+    HadoopDsl.reset_dsl_file
+    @script_body = 'This is a script body.'
+    @script = create_tmp_script(@script_body)
+  end
+
   it 'can change camelcase str to snakecase' do
     HadoopDsl.snake_case('CamelCaseStr').should == 'camel_case_str'
   end

   it 'can read file and get file data to string' do
-    script_body = 'This is a script body.'
-    @script = create_tmp_script(script_body)
-    HadoopDsl.read_file(@script).should == script_body
+    HadoopDsl.read_file(@script).should == @script_body
   end

   it 'raise error if no file in loadpath' do
     lambda { HadoopDsl.read_file('not_exists_on_loadpath') }.should raise_error
   end
+
+  it 'can load from cache if script is loaded' do
+    HadoopDsl.read_file(@script).should == @script_body
+    File.delete(@script)
+    HadoopDsl.read_file(@script).should == @script_body
+  end
+
+  it 'can load from each cache even if one script is loaded' do
+    HadoopDsl.read_file(@script).should == @script_body
+    another_script = create_tmp_script("another")
+    HadoopDsl.read_file(another_script).should == "another"
+  end
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: hadoop-rubydsl
 version: !ruby/object:Gem::Version
-  version: 0.0.4
+  version: 0.0.5
 platform: ruby
 authors:
 - Koichi Fujikawa
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []

-date: 2010-01-13 00:00:00 +09:00
+date: 2010-01-28 00:00:00 +09:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency