hadoop-rubydsl 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.rdoc ADDED
@@ -0,0 +1,58 @@
1
+ = hadoop-rubydsl
2
+
3
+ Enables you to run Ruby DSL scripts on your Hadoop cluster.
4
+
5
+ == Description
6
+
7
+ You can write DSL by Ruby to run Hadoop as Mapper / Reducer.
8
+ This gem depends on 'jruby-on-hadoop' project.
9
+
10
+ == Install
11
+
12
+ Required gems are all on GemCutter.
13
+
14
+ 1. Upgrade your RubyGems to 1.3.5
15
+ 2. Install gems
16
+ $ gem install hadoop-rubydsl
17
+
18
+ == Usage
19
+
20
+ 1. Run Hadoop cluster on your machines and set HADOOP_HOME env variable.
21
+ 2. Put files into your HDFS, e.g. wc/inputs/file1
22
+ 3. Now you can run 'hrd' like below:
23
+ $ hrd examples/word_count_test.rb
24
+ You can get Hadoop job results in your HDFS at wc/outputs/part-*
25
+
26
+ == Examples
27
+
28
+ Word Count DSL script
29
+ use 'WordCount'
30
+
31
+ from 'wc/inputs'
32
+ to 'wc/outputs'
33
+
34
+ count_uniq
35
+ total :bytes, :words, :lines
36
+
37
+ Log Analysis DSL script
38
+ use 'LogAnalysis'
39
+
40
+ data 'apache log on test2' do
41
+ from 'apachelog/inputs'
42
+ to 'apachelog/outputs'
43
+
44
+ each_line do
45
+ pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*) (.*) "(.*)"/
46
+ column_name 'remote_host', 'pass', 'user', 'access_date', 'request', 'status', 'bytes', 'pass', 'ua'
47
+
48
+ topic 'ua counts', :label => 'ua' do
49
+ count_uniq column[:ua]
50
+ end
51
+ end
52
+ end
53
+
54
+ == Author
55
+ Koichi Fujikawa <fujibee@gmail.com>
56
+
57
+ == Copyright
58
+ License: Apache License
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.2
1
+ 0.0.3
data/bin/hrd ADDED
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'hadoop-dsl'
4
+
5
+ HadoopDsl::Client.new(ARGV).run
@@ -5,26 +5,27 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{hadoop-rubydsl}
8
- s.version = "0.0.2"
8
+ s.version = "0.0.3"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Koichi Fujikawa"]
12
- s.date = %q{2009-12-28}
12
+ s.date = %q{2010-01-04}
13
13
  s.description = %q{Hadoop Ruby DSL}
14
14
  s.email = %q{fujibee@gmail.com}
15
- s.executables = ["hadoop-hudson.sh", "hadoop-ruby.sh"]
15
+ s.executables = ["hrd", "hadoop-hudson.sh", "hadoop-ruby.sh"]
16
16
  s.extra_rdoc_files = [
17
- "README",
17
+ "README.rdoc",
18
18
  "TODO"
19
19
  ]
20
20
  s.files = [
21
21
  ".gitignore",
22
- "README",
22
+ "README.rdoc",
23
23
  "Rakefile",
24
24
  "TODO",
25
25
  "VERSION",
26
26
  "bin/hadoop-hudson.sh",
27
27
  "bin/hadoop-ruby.sh",
28
+ "bin/hrd",
28
29
  "conf/hadoop-site.xml",
29
30
  "examples/apachelog-v2-2.rb",
30
31
  "examples/apachelog-v2.rb",
@@ -32,11 +33,11 @@ Gem::Specification.new do |s|
32
33
  "examples/hive_like_test.rb",
33
34
  "examples/word_count_test.rb",
34
35
  "hadoop-rubydsl.gemspec",
36
+ "lib/client.rb",
35
37
  "lib/core.rb",
38
+ "lib/dsl_init.rb",
39
+ "lib/hadoop-dsl.rb",
36
40
  "lib/hive_like.rb",
37
- "lib/init.rb",
38
- "lib/java/.gitignore",
39
- "lib/java/hadoop-ruby.jar",
40
41
  "lib/log_analysis.rb",
41
42
  "lib/mapred_factory.rb",
42
43
  "lib/util.rb",
@@ -49,14 +50,15 @@ Gem::Specification.new do |s|
49
50
  s.summary = %q{Hadoop Ruby DSL}
50
51
  s.test_files = [
51
52
  "spec/spec_helper.rb",
53
+ "spec/dsl_init_spec.rb",
52
54
  "spec/core_spec.rb",
55
+ "spec/client_spec.rb",
53
56
  "spec/util_spec.rb",
54
57
  "spec/mapred_factory_spec.rb",
55
58
  "spec/word_count_spec.rb",
56
59
  "spec/hive_like_spec.rb",
57
60
  "spec/log_analysis_spec.rb",
58
61
  "spec/example_spec.rb",
59
- "spec/init_spec.rb",
60
62
  "examples/apachelog-v2.rb",
61
63
  "examples/hive_like_test.rb",
62
64
  "examples/word_count_test.rb",
data/lib/client.rb ADDED
@@ -0,0 +1,27 @@
1
+ module HadoopDsl
2
+ class Client < JRubyOnHadoop::Client
3
+ def parse_args
4
+ super
5
+ @script_path = HadoopDsl.dsl_init_script
6
+ @script = File.basename(@script_path)
7
+ @dsl_file_path = @args[0]
8
+ @dsl_file = File.basename(@dsl_file_path)
9
+ @files << @script_path << @dsl_file_path
10
+
11
+ # TODO move properly, with jruby-on-hadoop
12
+ add_dsl_lib_files
13
+ ENV['RUBYLIB'] = File.dirname(@dsl_file_path)
14
+ end
15
+
16
+ def mapred_args
17
+ args = super
18
+ args += " --dslfile #{@dsl_file}"
19
+ args
20
+ end
21
+
22
+ def add_dsl_lib_files
23
+ lib_path = HadoopDsl.lib_path
24
+ @files += Dir.glob(File.join(lib_path, "*.rb"))
25
+ end
26
+ end
27
+ end
@@ -12,15 +12,14 @@ HadoopDsl::Text = Text
12
12
  HadoopDsl::IntWritable = IntWritable
13
13
 
14
14
  def map(key, value, output, reporter, script)
15
- mapper = MapperFactory.create(script, key.to_string, value.to_string)
15
+ mapper = MapperFactory.create(script, key, value)
16
16
  mapper.run
17
17
 
18
18
  write(output, mapper)
19
19
  end
20
20
 
21
21
  def reduce(key, values, output, reporter, script)
22
- ruby_values = values.map {|v| to_ruby(v)}
23
- reducer = ReducerFactory.create(script, key.to_string, ruby_values)
22
+ reducer = ReducerFactory.create(script, key, values)
24
23
  reducer.run
25
24
 
26
25
  write(output, reducer)
@@ -29,8 +28,7 @@ end
29
28
  def setup(conf, script)
30
29
  setup = SetupFactory.create(script, conf)
31
30
  setup.run
32
-
33
- setup.paths.to_java
31
+ setup.paths
34
32
  end
35
33
 
36
34
  private
@@ -38,23 +36,7 @@ private
38
36
  def write(output, controller)
39
37
  controller.emitted.each do |e|
40
38
  e.each do |k, v|
41
- output.collect(to_hadoop(k), to_hadoop(v))
39
+ output.collect(k, v)
42
40
  end
43
41
  end
44
42
  end
45
-
46
- def to_ruby(value)
47
- case value
48
- when IntWritable then value.get
49
- when Text then value.to_string
50
- else raise "no match class: #{value.class}"
51
- end
52
- end
53
-
54
- def to_hadoop(value)
55
- case value
56
- when Integer then IntWritable.new(value)
57
- when String then t = Text.new; t.set(value); t
58
- else raise "no match class: #{value.class}"
59
- end
60
- end
data/lib/hadoop-dsl.rb ADDED
@@ -0,0 +1,12 @@
1
+ require 'jruby-on-hadoop'
2
+ require 'client'
3
+
4
+ module HadoopDsl
5
+ def self.lib_path
6
+ File.expand_path(File.dirname(__FILE__))
7
+ end
8
+
9
+ def self.dsl_init_script
10
+ File.join(lib_path, "dsl_init.rb")
11
+ end
12
+ end
data/lib/util.rb CHANGED
@@ -6,6 +6,17 @@ module HadoopDsl
6
6
  end
7
7
 
8
8
  def read_file(file_name)
9
- File.open(file_name).read
9
+ # read as usual
10
+ body = File.open(file_name).read rescue nil
11
+ return body if body
12
+
13
+ # read from loadpath
14
+ $:.each do |path|
15
+ p path
16
+ body = File.open(File.join(path, file_name)).read rescue next
17
+ return body if body
18
+ end
19
+
20
+ raise "cannot find file - #{file_name}"
10
21
  end
11
22
  end
@@ -0,0 +1,26 @@
1
+ require 'hadoop-dsl'
2
+
3
+ describe HadoopDsl::Client do
4
+ before do
5
+ @client = HadoopDsl::Client.new(["examples/wordcount.rb", "in", "out"])
6
+ end
7
+
8
+ it 'can parse args' do
9
+ @client.files.join.should match /ruby_wrapper\.rb/
10
+ @client.files.join.should match /dsl_init\.rb/
11
+ @client.files.should include 'examples/wordcount.rb'
12
+ @client.inputs.should == 'in'
13
+ @client.outputs.should == 'out'
14
+ end
15
+
16
+ it 'can add dsl file into mapred args' do
17
+ @client.mapred_args.should ==
18
+ "--script dsl_init.rb in out --dslfile wordcount.rb"
19
+ end
20
+
21
+ it 'can add dsl lib files' do
22
+ lib_path = HadoopDsl.lib_path
23
+ @client.files.should include File.join(lib_path, 'core.rb')
24
+ @client.files.should include File.join(lib_path, 'log_analysis.rb')
25
+ end
26
+ end
data/spec/core_spec.rb CHANGED
@@ -1,4 +1,4 @@
1
- require 'init'
1
+ require 'dsl_init'
2
2
  require 'core'
3
3
 
4
4
  include HadoopDsl
@@ -1,9 +1,4 @@
1
- require 'java'
2
- require 'init'
3
-
4
- import 'org.apache.hadoop.io.IntWritable'
5
- import 'org.apache.hadoop.io.Text'
6
- import 'org.apache.hadoop.mapred.JobConf'
1
+ require 'dsl_init'
7
2
 
8
3
  describe 'mapreduce init' do
9
4
 
@@ -24,22 +19,20 @@ end
24
19
  end
25
20
 
26
21
  before do
27
- @one = IntWritable.new(1)
22
+ @one = 1
28
23
  @output = mock('output')
29
24
  end
30
25
 
31
26
  it 'can map sucessfully' do
32
- key, value = Text.new, Text.new
33
- key.set("key")
34
- value.set('it should be fine')
27
+ key = 'key'
28
+ value = 'it should be fine'
35
29
  @output.should_receive(:collect).once #.with(@text, @one)
36
30
 
37
31
  map(key, value, @output, nil, @script)
38
32
  end
39
33
 
40
34
  it 'can reduce sucessfully' do
41
- key, value = Text.new, Text.new
42
- key.set("t1\tkey")
35
+ key = "t1\tkey"
43
36
  values = [@one, @one, @one]
44
37
  @output.should_receive(:collect).once #.with(@text, @one)
45
38
 
@@ -47,7 +40,7 @@ end
47
40
  end
48
41
 
49
42
  it 'can set job conf' do
50
- conf = JobConf.new
43
+ conf = mock('jobconf')
51
44
  paths = setup(conf, @script)
52
45
 
53
46
  paths[0].should == 'test/inputs'
@@ -1,4 +1,3 @@
1
- require 'init'
2
1
  require 'core'
3
2
  require 'hive_like'
4
3
 
@@ -1,4 +1,3 @@
1
- require 'init'
2
1
  require 'core'
3
2
  require 'log_analysis'
4
3
 
data/spec/util_spec.rb CHANGED
@@ -12,4 +12,8 @@ describe 'utilities' do
12
12
  @script = create_tmp_script(script_body)
13
13
  read_file(@script).should == script_body
14
14
  end
15
+
16
+ it 'raise error if no file in loadpath' do
17
+ lambda { read_file('not_exists_on_loadpath') }.should raise_error
18
+ end
15
19
  end
@@ -1,4 +1,3 @@
1
- require 'init'
2
1
  require 'core'
3
2
  require 'word_count'
4
3
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hadoop-rubydsl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Koichi Fujikawa
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-12-28 00:00:00 +09:00
12
+ date: 2010-01-04 00:00:00 +09:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -25,21 +25,23 @@ dependencies:
25
25
  description: Hadoop Ruby DSL
26
26
  email: fujibee@gmail.com
27
27
  executables:
28
+ - hrd
28
29
  - hadoop-hudson.sh
29
30
  - hadoop-ruby.sh
30
31
  extensions: []
31
32
 
32
33
  extra_rdoc_files:
33
- - README
34
+ - README.rdoc
34
35
  - TODO
35
36
  files:
36
37
  - .gitignore
37
- - README
38
+ - README.rdoc
38
39
  - Rakefile
39
40
  - TODO
40
41
  - VERSION
41
42
  - bin/hadoop-hudson.sh
42
43
  - bin/hadoop-ruby.sh
44
+ - bin/hrd
43
45
  - conf/hadoop-site.xml
44
46
  - examples/apachelog-v2-2.rb
45
47
  - examples/apachelog-v2.rb
@@ -47,11 +49,11 @@ files:
47
49
  - examples/hive_like_test.rb
48
50
  - examples/word_count_test.rb
49
51
  - hadoop-rubydsl.gemspec
52
+ - lib/client.rb
50
53
  - lib/core.rb
54
+ - lib/dsl_init.rb
55
+ - lib/hadoop-dsl.rb
51
56
  - lib/hive_like.rb
52
- - lib/init.rb
53
- - lib/java/.gitignore
54
- - lib/java/hadoop-ruby.jar
55
57
  - lib/log_analysis.rb
56
58
  - lib/mapred_factory.rb
57
59
  - lib/util.rb
@@ -86,14 +88,15 @@ specification_version: 3
86
88
  summary: Hadoop Ruby DSL
87
89
  test_files:
88
90
  - spec/spec_helper.rb
91
+ - spec/dsl_init_spec.rb
89
92
  - spec/core_spec.rb
93
+ - spec/client_spec.rb
90
94
  - spec/util_spec.rb
91
95
  - spec/mapred_factory_spec.rb
92
96
  - spec/word_count_spec.rb
93
97
  - spec/hive_like_spec.rb
94
98
  - spec/log_analysis_spec.rb
95
99
  - spec/example_spec.rb
96
- - spec/init_spec.rb
97
100
  - examples/apachelog-v2.rb
98
101
  - examples/hive_like_test.rb
99
102
  - examples/word_count_test.rb
data/README DELETED
@@ -1,53 +0,0 @@
1
- = hadoop-rubydsl
2
-
3
- == Description
4
- HadoopのMapper/ReducerをRubyによるDSLで記述することができます。
5
- hadoop-ruby.jarを利用します。
6
-
7
- 例)
8
- apachelog.rb
9
-
10
- # log:
11
- # 127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326
12
- # 127.0.0.1 - frank2 [10/Oct/2000:13:55:36 -0700] "GET /apache_pb2.gif HTTP/1.0" 200 2326
13
- # 127.0.0.1 - frank2 [10/Oct/2000:13:55:36 -0700] "GET /apache_pb3.gif HTTP/1.0" 404 2326
14
-
15
- use 'LogAnalysis'
16
- data.pattern /(.*) (.*) (.*) (\[.*\]) (".*") (\d*) (\d*)/
17
- column[2].count_uniq
18
- column[3].count_uniq
19
- column[4].count_uniq
20
- column[5].count_uniq
21
- column[6].sum
22
-
23
- =>
24
- col2 frank 1
25
- col2 frank2 2
26
- col3 [10/Oct/2000:13:55:36 -0700] 3
27
- col4 "GET /apache_pb.gif HTTP/1.0" 1
28
- col4 "GET /apache_pb2.gif HTTP/1.0" 1
29
- col4 "GET /apache_pb3.gif HTTP/1.0" 1
30
- col5 200 2
31
- col5 404 1
32
- col6 6978
33
-
34
- == Usage
35
- 0. HADOOP_HOMEを正しく設定し、Hadoopを一式立ち上げておく。
36
-
37
- 1. jruby-complete-*.jar を lib/java 以下にコピー
38
- ex)
39
- $ wget http://jruby.kenai.com/downloads/1.4.0RC2/jruby-complete-1.4.0RC2.jar
40
- $ cp jruby-complete-*.jar lib/java/
41
-
42
- 2. データを HDFS にアップロード
43
- ex)
44
- $ hadoop dfs -copyFromLocal apachelog inputs/
45
-
46
- 3. MapReduce実行
47
- $ bin/hadoop-ruby.sh examples/apachelog.rb inputs outputs
48
-
49
- == Author
50
- Koichi Fujikawa <fujibee@gmail.com>
51
-
52
- == Copyright
53
- License: Apache License
data/lib/java/.gitignore DELETED
@@ -1 +0,0 @@
1
- jruby-complete-*.jar
Binary file