jruby-on-hadoop 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,6 +1,9 @@
1
1
  = JRuby on Hadoop
2
2
 
3
3
  JRuby on Hadoop is a thin JRuby wrapper for the Hadoop Mapper / Reducer.
4
+ We recommend using this together with hadoop-rubydsl, which is available on GitHub / Gemcutter.
5
+
6
+ == Description
4
7
 
5
8
  == Install
6
9
 
@@ -10,7 +13,7 @@ Required gems are all on GemCutter.
10
13
  2. Install gems
11
14
  $ gem install jruby-on-hadoop
12
15
 
13
- == Description
16
+ == Usage
14
17
 
15
18
  1. Run Hadoop cluster on your machines and set HADOOP_HOME env variable.
16
19
  2. Put files into your HDFS, e.g. test/inputs/file1
@@ -18,18 +21,27 @@ Required gems are all on GemCutter.
18
21
  $ joh examples/wordcount.rb test/inputs test/outputs
19
22
  You can find the Hadoop job results in your HDFS under test/outputs/part-*
20
23
 
21
- Script example. (see also examples/wordcount.rb)
24
+ == Example
25
+ See also examples/wordcount.rb
22
26
 
23
27
  def setup(conf)
24
28
  # setup jobconf
25
29
  end
26
30
 
27
- def map(script, key, value, output, reporter)
31
+ def map(key, value, output, reporter)
28
32
  # mapper process
33
+ # (wordcount example)
34
+ value.split.each do |word|
35
+ output.collect(word, 1)
36
+ end
29
37
  end
30
38
 
31
- def reduce(script, key, values, output, reporter)
39
+ def reduce(key, values, output, reporter)
32
40
  # reducer process
41
+ # (wordcount example)
42
+ sum = 0
43
+ values.each {|v| sum += v }
44
+ output.collect(key, sum)
33
45
  end
34
46
 
35
47
  == Build
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.3
1
+ 0.0.4
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{jruby-on-hadoop}
8
- s.version = "0.0.3"
8
+ s.version = "0.0.4"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Koichi Fujikawa"]
12
- s.date = %q{2010-01-03}
12
+ s.date = %q{2010-01-15}
13
13
  s.default_executable = %q{joh}
14
14
  s.description = %q{JRuby on Hadoop}
15
15
  s.email = %q{fujibee@gmail.com}
@@ -29,6 +29,7 @@ Gem::Specification.new do |s|
29
29
  "lib/jruby-on-hadoop.rb",
30
30
  "lib/jruby-on-hadoop/client.rb",
31
31
  "lib/ruby_wrapper.rb",
32
+ "spec/jruby-on-hadoop/client_spec.rb",
32
33
  "spec/jruby-on-hadoop_spec.rb",
33
34
  "spec/ruby_wrapper_spec.rb",
34
35
  "test/java/org/apache/hadoop/ruby/JRubyJobRunnerTest.java",
@@ -41,7 +42,8 @@ Gem::Specification.new do |s|
41
42
  s.rubygems_version = %q{1.3.5}
42
43
  s.summary = %q{JRuby on Hadoop}
43
44
  s.test_files = [
44
- "spec/jruby-on-hadoop_spec.rb",
45
+ "spec/jruby-on-hadoop/client_spec.rb",
46
+ "spec/jruby-on-hadoop_spec.rb",
45
47
  "spec/ruby_wrapper_spec.rb",
46
48
  "examples/wordcount.rb"
47
49
  ]
Binary file
@@ -9,18 +9,29 @@ module JRubyOnHadoop
9
9
  parse_args
10
10
 
11
11
  # env check
12
- hadoop_home = ENV['HADOOP_HOME']
13
- raise 'HADOOP_HOME is not set' unless hadoop_home
14
- @hadoop_cmd = "#{hadoop_home}/bin/hadoop"
12
+ hadoop_home and hadoop_cmd
15
13
  ENV['HADOOP_CLASSPATH'] = "#{lib_path}:#{File.dirname(@script_path)}"
16
14
  end
17
15
 
16
+ def hadoop_home
17
+ home = ENV['HADOOP_HOME']
18
+ raise 'HADOOP_HOME is not set' if home.nil? or home.empty?
19
+ home
20
+ end
21
+
22
+ def hadoop_cmd
23
+ hadoop = `which hadoop 2>/dev/null`
24
+ hadoop = "#{hadoop_home}/bin/hadoop" if hadoop.nil? or hadoop.empty?
25
+ raise 'cannot find hadoop command' unless hadoop
26
+ hadoop.chomp
27
+ end
28
+
18
29
  def run
19
30
  exec cmd
20
31
  end
21
32
 
22
33
  def cmd
23
- "#{@hadoop_cmd} jar #{main_jar_path} #{JAVA_MAIN_CLASS}" +
34
+ "#{hadoop_cmd} jar #{main_jar_path} #{JAVA_MAIN_CLASS}" +
24
35
  " -libjars #{opt_libjars} -files #{opt_files} #{mapred_args}"
25
36
  end
26
37
 
@@ -0,0 +1,79 @@
1
+ require 'jruby-on-hadoop'
2
+
3
+ describe JRubyOnHadoop::Client do
4
+ before do
5
+ @client = JRubyOnHadoop::Client.new
6
+ end
7
+
8
+ it 'gather necessary jar paths' do
9
+ version_pattern = '[\d\.]*'
10
+ @client.main_jar_path.should include 'hadoop-ruby.jar'
11
+
12
+ @client.opt_libjars.should match /jruby\-core\-#{version_pattern}\.jar/
13
+ @client.opt_libjars.should match /jruby\-stdlib\-#{version_pattern}\.jar/
14
+ end
15
+
16
+ it 'gather necessary ruby files' do
17
+ @client.opt_files.split(",").should include "mapred.rb"
18
+ @client.opt_files.should match /ruby_wrapper\.rb/
19
+ end
20
+
21
+ it 'construct command for running hadoop' do
22
+ path_pattern = '[\w/\-\.,]*'
23
+ @client.cmd.should match /hadoop jar #{path_pattern}hadoop-ruby.jar org.apache.hadoop.ruby.JRubyJobRunner -libjars #{path_pattern}.jar -files mapred.rb/
24
+ end
25
+
26
+ it 'can get mapred args' do
27
+ client = JRubyOnHadoop::Client.new(["examples/mapred.rb", "inputs", "outputs"])
28
+ client.mapred_args.should == "--script mapred.rb inputs outputs"
29
+ end
30
+
31
+ it 'can parse args' do
32
+ client = JRubyOnHadoop::Client.new(["examples/mapred.rb", "in", "out"])
33
+ client.script.should == 'mapred.rb'
34
+ client.inputs.should == 'in'
35
+ client.outputs.should == 'out'
36
+ client.files.should include 'examples/mapred.rb'
37
+ end
38
+
39
+ it 'should raise error if HADOOP_HOME env is not set' do
40
+ saved = ENV['HADOOP_HOME']
41
+ ENV['HADOOP_HOME'] = ''
42
+ begin
43
+ lambda { JRubyOnHadoop::Client.new }.should raise_error
44
+ ensure
45
+ ENV['HADOOP_HOME'] = saved
46
+ end
47
+ end
48
+
49
+ it 'can determin bin/hadoop path' do
50
+ @client.hadoop_cmd.should match /hadoop$/
51
+ end
52
+
53
+ it 'can determin bin/hadoop path if even no in PATH env var' do
54
+ saved = ENV['PATH']
55
+ begin
56
+ ENV['PATH'] = ''
57
+ ENV['HADOOP_HOME'].should_not be_empty
58
+ client = JRubyOnHadoop::Client.new
59
+ client.hadoop_cmd.should match ENV['HADOOP_HOME']
60
+ client.hadoop_cmd.should match /hadoop$/
61
+ ensure
62
+ ENV['PATH'] = saved
63
+ end
64
+ end
65
+
66
+ it 'should raise error if cannot determin bin/hadoop path' do
67
+ saved_path = ENV['PATH']
68
+ saved_home = ENV['HADOOP_HOME']
69
+ begin
70
+ ENV['PATH'] = ''
71
+ lambda { JRubyOnHadoop::Client.new }.should_not raise_error
72
+ ENV['HADOOP_HOME'] = ''
73
+ lambda { JRubyOnHadoop::Client.new }.should raise_error
74
+ ensure
75
+ ENV['PATH'] = saved_path
76
+ ENV['HADOOP_HOME'] = saved_home
77
+ end
78
+ end
79
+ end
@@ -18,40 +18,3 @@ describe JRubyOnHadoop do
18
18
  JRubyOnHadoop.wrapper_ruby_file.should == path
19
19
  end
20
20
  end
21
-
22
- describe JRubyOnHadoop::Client do
23
- before do
24
- @client = JRubyOnHadoop::Client.new
25
- end
26
-
27
- it 'gather necessary jar paths' do
28
- version_pattern = '[\d\.]*'
29
- @client.main_jar_path.should include 'hadoop-ruby.jar'
30
-
31
- @client.opt_libjars.should match /jruby\-core\-#{version_pattern}\.jar/
32
- @client.opt_libjars.should match /jruby\-stdlib\-#{version_pattern}\.jar/
33
- end
34
-
35
- it 'gather necessary ruby files' do
36
- @client.opt_files.split(",").should include "mapred.rb"
37
- @client.opt_files.should match /ruby_wrapper\.rb/
38
- end
39
-
40
- it 'construct command for running hadoop' do
41
- path_pattern = '[\w/\-\.,]*'
42
- @client.cmd.should match /hadoop jar #{path_pattern}hadoop-ruby.jar org.apache.hadoop.ruby.JRubyJobRunner -libjars #{path_pattern}.jar -files mapred.rb/
43
- end
44
-
45
- it 'can get mapred args' do
46
- client = JRubyOnHadoop::Client.new(["examples/mapred.rb", "inputs", "outputs"])
47
- client.mapred_args.should == "--script mapred.rb inputs outputs"
48
- end
49
-
50
- it 'can parse args' do
51
- client = JRubyOnHadoop::Client.new(["examples/mapred.rb", "in", "out"])
52
- client.script.should == 'mapred.rb'
53
- client.inputs.should == 'in'
54
- client.outputs.should == 'out'
55
- client.files.should include 'examples/mapred.rb'
56
- end
57
- end
@@ -5,7 +5,7 @@ describe 'wrapper' do
5
5
  examples_dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'examples'))
6
6
  $: << examples_dir
7
7
 
8
- @script = 'mapred.rb'
8
+ @script = 'wordcount.rb'
9
9
  @output, @repoter = mock('output'), mock('repoter')
10
10
 
11
11
  @key, @value = Text.new, Text.new
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jruby-on-hadoop
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Koichi Fujikawa
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2010-01-03 00:00:00 +09:00
12
+ date: 2010-01-15 00:00:00 +09:00
13
13
  default_executable: joh
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -42,6 +42,7 @@ files:
42
42
  - lib/jruby-on-hadoop.rb
43
43
  - lib/jruby-on-hadoop/client.rb
44
44
  - lib/ruby_wrapper.rb
45
+ - spec/jruby-on-hadoop/client_spec.rb
45
46
  - spec/jruby-on-hadoop_spec.rb
46
47
  - spec/ruby_wrapper_spec.rb
47
48
  - test/java/org/apache/hadoop/ruby/JRubyJobRunnerTest.java
@@ -76,6 +77,7 @@ signing_key:
76
77
  specification_version: 3
77
78
  summary: JRuby on Hadoop
78
79
  test_files:
80
+ - spec/jruby-on-hadoop/client_spec.rb
79
81
  - spec/jruby-on-hadoop_spec.rb
80
82
  - spec/ruby_wrapper_spec.rb
81
83
  - examples/wordcount.rb