hadoop-rubydsl 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc ADDED
@@ -0,0 +1,58 @@
1
+ = hadoop-rubydsl
2
+
3
+ Enables you to run Ruby DSL scripts on your Hadoop cluster.
4
+
5
+ == Description
6
+
7
+ You can write a DSL script in Ruby that runs on Hadoop as a Mapper / Reducer.
8
+ This gem depends on 'jruby-on-hadoop' project.
9
+
10
+ == Install
11
+
12
+ Required gems are all on GemCutter.
13
+
14
+ 1. Upgrade your RubyGems to 1.3.5
15
+ 2. Install gems
16
+ $ gem install hadoop-rubydsl
17
+
18
+ == Usage
19
+
20
+ 1. Run Hadoop cluster on your machines and set HADOOP_HOME env variable.
21
+ 2. Put files into your HDFS. ex) wc/inputs/file1
22
+ 3. Now you can run 'hrd' like below:
23
+ $ hrd examples/word_count_test.rb
24
+ You can get Hadoop job results in your HDFS at wc/outputs/part-*
25
+
26
+ == Examples
27
+
28
+ Word Count DSL script
29
+ use 'WordCount'
30
+
31
+ from 'wc/inputs'
32
+ to 'wc/outputs'
33
+
34
+ count_uniq
35
+ total :bytes, :words, :lines
36
+
37
+ Log Analysis DSL script
38
+ use 'LogAnalysis'
39
+
40
+ data 'apache log on test2' do
41
+ from 'apachelog/inputs'
42
+ to 'apachelog/outputs'
43
+
44
+ each_line do
45
+ pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*) (.*) "(.*)"/
46
+ column_name 'remote_host', 'pass', 'user', 'access_date', 'request', 'status', 'bytes', 'pass', 'ua'
47
+
48
+ topic 'ua counts', :label => 'ua' do
49
+ count_uniq column[:ua]
50
+ end
51
+ end
52
+ end
53
+
54
+ == Author
55
+ Koichi Fujikawa <fujibee@gmail.com>
56
+
57
+ == Copyright
58
+ License: Apache License
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.2
1
+ 0.0.3
data/bin/hrd ADDED
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'hadoop-dsl'
4
+
5
+ HadoopDsl::Client.new(ARGV).run
@@ -5,26 +5,27 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{hadoop-rubydsl}
8
- s.version = "0.0.2"
8
+ s.version = "0.0.3"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Koichi Fujikawa"]
12
- s.date = %q{2009-12-28}
12
+ s.date = %q{2010-01-04}
13
13
  s.description = %q{Hadoop Ruby DSL}
14
14
  s.email = %q{fujibee@gmail.com}
15
- s.executables = ["hadoop-hudson.sh", "hadoop-ruby.sh"]
15
+ s.executables = ["hrd", "hadoop-hudson.sh", "hadoop-ruby.sh"]
16
16
  s.extra_rdoc_files = [
17
- "README",
17
+ "README.rdoc",
18
18
  "TODO"
19
19
  ]
20
20
  s.files = [
21
21
  ".gitignore",
22
- "README",
22
+ "README.rdoc",
23
23
  "Rakefile",
24
24
  "TODO",
25
25
  "VERSION",
26
26
  "bin/hadoop-hudson.sh",
27
27
  "bin/hadoop-ruby.sh",
28
+ "bin/hrd",
28
29
  "conf/hadoop-site.xml",
29
30
  "examples/apachelog-v2-2.rb",
30
31
  "examples/apachelog-v2.rb",
@@ -32,11 +33,11 @@ Gem::Specification.new do |s|
32
33
  "examples/hive_like_test.rb",
33
34
  "examples/word_count_test.rb",
34
35
  "hadoop-rubydsl.gemspec",
36
+ "lib/client.rb",
35
37
  "lib/core.rb",
38
+ "lib/dsl_init.rb",
39
+ "lib/hadoop-dsl.rb",
36
40
  "lib/hive_like.rb",
37
- "lib/init.rb",
38
- "lib/java/.gitignore",
39
- "lib/java/hadoop-ruby.jar",
40
41
  "lib/log_analysis.rb",
41
42
  "lib/mapred_factory.rb",
42
43
  "lib/util.rb",
@@ -49,14 +50,15 @@ Gem::Specification.new do |s|
49
50
  s.summary = %q{Hadoop Ruby DSL}
50
51
  s.test_files = [
51
52
  "spec/spec_helper.rb",
53
+ "spec/dsl_init_spec.rb",
52
54
  "spec/core_spec.rb",
55
+ "spec/client_spec.rb",
53
56
  "spec/util_spec.rb",
54
57
  "spec/mapred_factory_spec.rb",
55
58
  "spec/word_count_spec.rb",
56
59
  "spec/hive_like_spec.rb",
57
60
  "spec/log_analysis_spec.rb",
58
61
  "spec/example_spec.rb",
59
- "spec/init_spec.rb",
60
62
  "examples/apachelog-v2.rb",
61
63
  "examples/hive_like_test.rb",
62
64
  "examples/word_count_test.rb",
data/lib/client.rb ADDED
@@ -0,0 +1,27 @@
1
+ module HadoopDsl
2
+ class Client < JRubyOnHadoop::Client
3
+ def parse_args
4
+ super
5
+ @script_path = HadoopDsl.dsl_init_script
6
+ @script = File.basename(@script_path)
7
+ @dsl_file_path = @args[0]
8
+ @dsl_file = File.basename(@dsl_file_path)
9
+ @files << @script_path << @dsl_file_path
10
+
11
+ # TODO move properly, with jruby-on-hadoop
12
+ add_dsl_lib_files
13
+ ENV['RUBYLIB'] = File.dirname(@dsl_file_path)
14
+ end
15
+
16
+ def mapred_args
17
+ args = super
18
+ args += " --dslfile #{@dsl_file}"
19
+ args
20
+ end
21
+
22
+ def add_dsl_lib_files
23
+ lib_path = HadoopDsl.lib_path
24
+ @files += Dir.glob(File.join(lib_path, "*.rb"))
25
+ end
26
+ end
27
+ end
@@ -12,15 +12,14 @@ HadoopDsl::Text = Text
12
12
  HadoopDsl::IntWritable = IntWritable
13
13
 
14
14
  def map(key, value, output, reporter, script)
15
- mapper = MapperFactory.create(script, key.to_string, value.to_string)
15
+ mapper = MapperFactory.create(script, key, value)
16
16
  mapper.run
17
17
 
18
18
  write(output, mapper)
19
19
  end
20
20
 
21
21
  def reduce(key, values, output, reporter, script)
22
- ruby_values = values.map {|v| to_ruby(v)}
23
- reducer = ReducerFactory.create(script, key.to_string, ruby_values)
22
+ reducer = ReducerFactory.create(script, key, values)
24
23
  reducer.run
25
24
 
26
25
  write(output, reducer)
@@ -29,8 +28,7 @@ end
29
28
  def setup(conf, script)
30
29
  setup = SetupFactory.create(script, conf)
31
30
  setup.run
32
-
33
- setup.paths.to_java
31
+ setup.paths
34
32
  end
35
33
 
36
34
  private
@@ -38,23 +36,7 @@ private
38
36
  def write(output, controller)
39
37
  controller.emitted.each do |e|
40
38
  e.each do |k, v|
41
- output.collect(to_hadoop(k), to_hadoop(v))
39
+ output.collect(k, v)
42
40
  end
43
41
  end
44
42
  end
45
-
46
- def to_ruby(value)
47
- case value
48
- when IntWritable then value.get
49
- when Text then value.to_string
50
- else raise "no match class: #{value.class}"
51
- end
52
- end
53
-
54
- def to_hadoop(value)
55
- case value
56
- when Integer then IntWritable.new(value)
57
- when String then t = Text.new; t.set(value); t
58
- else raise "no match class: #{value.class}"
59
- end
60
- end
data/lib/hadoop-dsl.rb ADDED
@@ -0,0 +1,12 @@
1
+ require 'jruby-on-hadoop'
2
+ require 'client'
3
+
4
+ module HadoopDsl
5
+ def self.lib_path
6
+ File.expand_path(File.dirname(__FILE__))
7
+ end
8
+
9
+ def self.dsl_init_script
10
+ File.join(lib_path, "dsl_init.rb")
11
+ end
12
+ end
data/lib/util.rb CHANGED
@@ -6,6 +6,17 @@ module HadoopDsl
6
6
  end
7
7
 
8
8
  def read_file(file_name)
9
- File.open(file_name).read
9
+ # read as usual
10
+ body = File.open(file_name).read rescue nil
11
+ return body if body
12
+
13
+ # read from loadpath
14
+ $:.each do |path|
15
+ p path
16
+ body = File.open(File.join(path, file_name)).read rescue next
17
+ return body if body
18
+ end
19
+
20
+ raise "cannot find file - #{file_name}"
10
21
  end
11
22
  end
@@ -0,0 +1,26 @@
1
+ require 'hadoop-dsl'
2
+
3
+ describe HadoopDsl::Client do
4
+ before do
5
+ @client = HadoopDsl::Client.new(["examples/wordcount.rb", "in", "out"])
6
+ end
7
+
8
+ it 'can parse args' do
9
+ @client.files.join.should match /ruby_wrapper\.rb/
10
+ @client.files.join.should match /dsl_init\.rb/
11
+ @client.files.should include 'examples/wordcount.rb'
12
+ @client.inputs.should == 'in'
13
+ @client.outputs.should == 'out'
14
+ end
15
+
16
+ it 'can add dsl file into mapred args' do
17
+ @client.mapred_args.should ==
18
+ "--script dsl_init.rb in out --dslfile wordcount.rb"
19
+ end
20
+
21
+ it 'can add dsl lib files' do
22
+ lib_path = HadoopDsl.lib_path
23
+ @client.files.should include File.join(lib_path, 'core.rb')
24
+ @client.files.should include File.join(lib_path, 'log_analysis.rb')
25
+ end
26
+ end
data/spec/core_spec.rb CHANGED
@@ -1,4 +1,4 @@
1
- require 'init'
1
+ require 'dsl_init'
2
2
  require 'core'
3
3
 
4
4
  include HadoopDsl
@@ -1,9 +1,4 @@
1
- require 'java'
2
- require 'init'
3
-
4
- import 'org.apache.hadoop.io.IntWritable'
5
- import 'org.apache.hadoop.io.Text'
6
- import 'org.apache.hadoop.mapred.JobConf'
1
+ require 'dsl_init'
7
2
 
8
3
  describe 'mapreduce init' do
9
4
 
@@ -24,22 +19,20 @@ end
24
19
  end
25
20
 
26
21
  before do
27
- @one = IntWritable.new(1)
22
+ @one = 1
28
23
  @output = mock('output')
29
24
  end
30
25
 
31
26
  it 'can map sucessfully' do
32
- key, value = Text.new, Text.new
33
- key.set("key")
34
- value.set('it should be fine')
27
+ key = 'key'
28
+ value = 'it should be fine'
35
29
  @output.should_receive(:collect).once #.with(@text, @one)
36
30
 
37
31
  map(key, value, @output, nil, @script)
38
32
  end
39
33
 
40
34
  it 'can reduce sucessfully' do
41
- key, value = Text.new, Text.new
42
- key.set("t1\tkey")
35
+ key = "t1\tkey"
43
36
  values = [@one, @one, @one]
44
37
  @output.should_receive(:collect).once #.with(@text, @one)
45
38
 
@@ -47,7 +40,7 @@ end
47
40
  end
48
41
 
49
42
  it 'can set job conf' do
50
- conf = JobConf.new
43
+ conf = mock('jobconf')
51
44
  paths = setup(conf, @script)
52
45
 
53
46
  paths[0].should == 'test/inputs'
@@ -1,4 +1,3 @@
1
- require 'init'
2
1
  require 'core'
3
2
  require 'hive_like'
4
3
 
@@ -1,4 +1,3 @@
1
- require 'init'
2
1
  require 'core'
3
2
  require 'log_analysis'
4
3
 
data/spec/util_spec.rb CHANGED
@@ -12,4 +12,8 @@ describe 'utilities' do
12
12
  @script = create_tmp_script(script_body)
13
13
  read_file(@script).should == script_body
14
14
  end
15
+
16
+ it 'raise error if no file in loadpath' do
17
+ lambda { read_file('not_exists_on_loadpath') }.should raise_error
18
+ end
15
19
  end
@@ -1,4 +1,3 @@
1
- require 'init'
2
1
  require 'core'
3
2
  require 'word_count'
4
3
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hadoop-rubydsl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Koichi Fujikawa
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-12-28 00:00:00 +09:00
12
+ date: 2010-01-04 00:00:00 +09:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -25,21 +25,23 @@ dependencies:
25
25
  description: Hadoop Ruby DSL
26
26
  email: fujibee@gmail.com
27
27
  executables:
28
+ - hrd
28
29
  - hadoop-hudson.sh
29
30
  - hadoop-ruby.sh
30
31
  extensions: []
31
32
 
32
33
  extra_rdoc_files:
33
- - README
34
+ - README.rdoc
34
35
  - TODO
35
36
  files:
36
37
  - .gitignore
37
- - README
38
+ - README.rdoc
38
39
  - Rakefile
39
40
  - TODO
40
41
  - VERSION
41
42
  - bin/hadoop-hudson.sh
42
43
  - bin/hadoop-ruby.sh
44
+ - bin/hrd
43
45
  - conf/hadoop-site.xml
44
46
  - examples/apachelog-v2-2.rb
45
47
  - examples/apachelog-v2.rb
@@ -47,11 +49,11 @@ files:
47
49
  - examples/hive_like_test.rb
48
50
  - examples/word_count_test.rb
49
51
  - hadoop-rubydsl.gemspec
52
+ - lib/client.rb
50
53
  - lib/core.rb
54
+ - lib/dsl_init.rb
55
+ - lib/hadoop-dsl.rb
51
56
  - lib/hive_like.rb
52
- - lib/init.rb
53
- - lib/java/.gitignore
54
- - lib/java/hadoop-ruby.jar
55
57
  - lib/log_analysis.rb
56
58
  - lib/mapred_factory.rb
57
59
  - lib/util.rb
@@ -86,14 +88,15 @@ specification_version: 3
86
88
  summary: Hadoop Ruby DSL
87
89
  test_files:
88
90
  - spec/spec_helper.rb
91
+ - spec/dsl_init_spec.rb
89
92
  - spec/core_spec.rb
93
+ - spec/client_spec.rb
90
94
  - spec/util_spec.rb
91
95
  - spec/mapred_factory_spec.rb
92
96
  - spec/word_count_spec.rb
93
97
  - spec/hive_like_spec.rb
94
98
  - spec/log_analysis_spec.rb
95
99
  - spec/example_spec.rb
96
- - spec/init_spec.rb
97
100
  - examples/apachelog-v2.rb
98
101
  - examples/hive_like_test.rb
99
102
  - examples/word_count_test.rb
data/README DELETED
@@ -1,53 +0,0 @@
1
- = hadoop-rubydsl
2
-
3
- == Description
4
- HadoopのMapper/ReducerをRubyによるDSLで記述することができます。
5
- hadoop-ruby.jarを利用します。
6
-
7
- 例)
8
- apachelog.rb
9
-
10
- # log:
11
- # 127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326
12
- # 127.0.0.1 - frank2 [10/Oct/2000:13:55:36 -0700] "GET /apache_pb2.gif HTTP/1.0" 200 2326
13
- # 127.0.0.1 - frank2 [10/Oct/2000:13:55:36 -0700] "GET /apache_pb3.gif HTTP/1.0" 404 2326
14
-
15
- use 'LogAnalysis'
16
- data.pattern /(.*) (.*) (.*) (\[.*\]) (".*") (\d*) (\d*)/
17
- column[2].count_uniq
18
- column[3].count_uniq
19
- column[4].count_uniq
20
- column[5].count_uniq
21
- column[6].sum
22
-
23
- =>
24
- col2 frank 1
25
- col2 frank2 2
26
- col3 [10/Oct/2000:13:55:36 -0700] 3
27
- col4 "GET /apache_pb.gif HTTP/1.0" 1
28
- col4 "GET /apache_pb2.gif HTTP/1.0" 1
29
- col4 "GET /apache_pb3.gif HTTP/1.0" 1
30
- col5 200 2
31
- col5 404 1
32
- col6 6978
33
-
34
- == Usage
35
- 0. HADOOP_HOMEを正しく設定し、Hadoopを一式立ち上げておく。
36
-
37
- 1. jruby-complete-*.jar を lib/java 以下にコピー
38
- ex)
39
- $ wget http://jruby.kenai.com/downloads/1.4.0RC2/jruby-complete-1.4.0RC2.jar
40
- $ cp jruby-complete-*.jar lib/java/
41
-
42
- 2. データを HDFS にアップロード
43
- ex)
44
- $ hadoop dfs -copyFromLocal apachelog inputs/
45
-
46
- 3. MapReduce実行
47
- $ bin/hadoop-ruby.sh examples/apachelog.rb inputs outputs
48
-
49
- == Author
50
- Koichi Fujikawa <fujibee@gmail.com>
51
-
52
- == Copyright
53
- License: Apache License
data/lib/java/.gitignore DELETED
@@ -1 +0,0 @@
1
- jruby-complete-*.jar
Binary file