jruby-on-hadoop 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,9 @@
1
1
  = JRuby on Hadoop
2
2
 
3
3
  JRuby on Hadoop is a thin JRuby wrapper for the Hadoop Mapper / Reducer.
4
+ We recommend using this together with hadoop-rubydsl, which is available on GitHub / Gemcutter.
5
+
6
+ == Description
4
7
 
5
8
  == Install
6
9
 
@@ -10,7 +13,7 @@ Required gems are all on GemCutter.
10
13
  2. Install gems
11
14
  $ gem install jruby-on-hadoop
12
15
 
13
- == Description
16
+ == Usage
14
17
 
15
18
  1. Run Hadoop cluster on your machines and set HADOOP_HOME env variable.
16
19
  2. Put files into your HDFS, e.g. test/inputs/file1
@@ -18,18 +21,27 @@ Required gems are all on GemCutter.
18
21
  $ joh examples/wordcount.rb test/inputs test/outputs
19
22
  You can find the Hadoop job results in your HDFS under test/outputs/part-*
20
23
 
21
- Script example. (see also examples/wordcount.rb)
24
+ == Example
25
+ see also examples/wordcount.rb
22
26
 
23
27
  def setup(conf)
24
28
  # setup jobconf
25
29
  end
26
30
 
27
- def map(script, key, value, output, reporter)
31
+ def map(key, value, output, reporter)
28
32
  # mapper process
33
+ # (wordcount example)
34
+ value.split.each do |word|
35
+ output.collect(word, 1)
36
+ end
29
37
  end
30
38
 
31
- def reduce(script, key, values, output, reporter)
39
+ def reduce(key, values, output, reporter)
32
40
  # reducer process
41
+ # (wordcount example)
42
+ sum = 0
43
+ values.each {|v| sum += v }
44
+ output.collect(key, sum)
33
45
  end
34
46
 
35
47
  == Build
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.3
1
+ 0.0.4
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{jruby-on-hadoop}
8
- s.version = "0.0.3"
8
+ s.version = "0.0.4"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Koichi Fujikawa"]
12
- s.date = %q{2010-01-03}
12
+ s.date = %q{2010-01-15}
13
13
  s.default_executable = %q{joh}
14
14
  s.description = %q{JRuby on Hadoop}
15
15
  s.email = %q{fujibee@gmail.com}
@@ -29,6 +29,7 @@ Gem::Specification.new do |s|
29
29
  "lib/jruby-on-hadoop.rb",
30
30
  "lib/jruby-on-hadoop/client.rb",
31
31
  "lib/ruby_wrapper.rb",
32
+ "spec/jruby-on-hadoop/client_spec.rb",
32
33
  "spec/jruby-on-hadoop_spec.rb",
33
34
  "spec/ruby_wrapper_spec.rb",
34
35
  "test/java/org/apache/hadoop/ruby/JRubyJobRunnerTest.java",
@@ -41,7 +42,8 @@ Gem::Specification.new do |s|
41
42
  s.rubygems_version = %q{1.3.5}
42
43
  s.summary = %q{JRuby on Hadoop}
43
44
  s.test_files = [
44
- "spec/jruby-on-hadoop_spec.rb",
45
+ "spec/jruby-on-hadoop/client_spec.rb",
46
+ "spec/jruby-on-hadoop_spec.rb",
45
47
  "spec/ruby_wrapper_spec.rb",
46
48
  "examples/wordcount.rb"
47
49
  ]
Binary file
@@ -9,18 +9,29 @@ module JRubyOnHadoop
9
9
  parse_args
10
10
 
11
11
  # env check
12
- hadoop_home = ENV['HADOOP_HOME']
13
- raise 'HADOOP_HOME is not set' unless hadoop_home
14
- @hadoop_cmd = "#{hadoop_home}/bin/hadoop"
12
+ hadoop_home and hadoop_cmd
15
13
  ENV['HADOOP_CLASSPATH'] = "#{lib_path}:#{File.dirname(@script_path)}"
16
14
  end
17
15
 
16
+ def hadoop_home
17
+ home = ENV['HADOOP_HOME']
18
+ raise 'HADOOP_HOME is not set' if home.nil? or home.empty?
19
+ home
20
+ end
21
+
22
+ def hadoop_cmd
23
+ hadoop = `which hadoop 2>/dev/null`
24
+ hadoop = "#{hadoop_home}/bin/hadoop" if hadoop.nil? or hadoop.empty?
25
+ raise 'cannot find hadoop command' unless hadoop
26
+ hadoop.chomp
27
+ end
28
+
18
29
  def run
19
30
  exec cmd
20
31
  end
21
32
 
22
33
  def cmd
23
- "#{@hadoop_cmd} jar #{main_jar_path} #{JAVA_MAIN_CLASS}" +
34
+ "#{hadoop_cmd} jar #{main_jar_path} #{JAVA_MAIN_CLASS}" +
24
35
  " -libjars #{opt_libjars} -files #{opt_files} #{mapred_args}"
25
36
  end
26
37
 
@@ -0,0 +1,79 @@
1
+ require 'jruby-on-hadoop'
2
+
3
+ describe JRubyOnHadoop::Client do
4
+ before do
5
+ @client = JRubyOnHadoop::Client.new
6
+ end
7
+
8
+ it 'gather necessary jar paths' do
9
+ version_pattern = '[\d\.]*'
10
+ @client.main_jar_path.should include 'hadoop-ruby.jar'
11
+
12
+ @client.opt_libjars.should match /jruby\-core\-#{version_pattern}\.jar/
13
+ @client.opt_libjars.should match /jruby\-stdlib\-#{version_pattern}\.jar/
14
+ end
15
+
16
+ it 'gather necessary ruby files' do
17
+ @client.opt_files.split(",").should include "mapred.rb"
18
+ @client.opt_files.should match /ruby_wrapper\.rb/
19
+ end
20
+
21
+ it 'construct command for running hadoop' do
22
+ path_pattern = '[\w/\-\.,]*'
23
+ @client.cmd.should match /hadoop jar #{path_pattern}hadoop-ruby.jar org.apache.hadoop.ruby.JRubyJobRunner -libjars #{path_pattern}.jar -files mapred.rb/
24
+ end
25
+
26
+ it 'can get mapred args' do
27
+ client = JRubyOnHadoop::Client.new(["examples/mapred.rb", "inputs", "outputs"])
28
+ client.mapred_args.should == "--script mapred.rb inputs outputs"
29
+ end
30
+
31
+ it 'can parse args' do
32
+ client = JRubyOnHadoop::Client.new(["examples/mapred.rb", "in", "out"])
33
+ client.script.should == 'mapred.rb'
34
+ client.inputs.should == 'in'
35
+ client.outputs.should == 'out'
36
+ client.files.should include 'examples/mapred.rb'
37
+ end
38
+
39
+ it 'should raise error if HADOOP_HOME env is not set' do
40
+ saved = ENV['HADOOP_HOME']
41
+ ENV['HADOOP_HOME'] = ''
42
+ begin
43
+ lambda { JRubyOnHadoop::Client.new }.should raise_error
44
+ ensure
45
+ ENV['HADOOP_HOME'] = saved
46
+ end
47
+ end
48
+
49
+ it 'can determin bin/hadoop path' do
50
+ @client.hadoop_cmd.should match /hadoop$/
51
+ end
52
+
53
+ it 'can determin bin/hadoop path if even no in PATH env var' do
54
+ saved = ENV['PATH']
55
+ begin
56
+ ENV['PATH'] = ''
57
+ ENV['HADOOP_HOME'].should_not be_empty
58
+ client = JRubyOnHadoop::Client.new
59
+ client.hadoop_cmd.should match ENV['HADOOP_HOME']
60
+ client.hadoop_cmd.should match /hadoop$/
61
+ ensure
62
+ ENV['PATH'] = saved
63
+ end
64
+ end
65
+
66
+ it 'should raise error if cannot determin bin/hadoop path' do
67
+ saved_path = ENV['PATH']
68
+ saved_home = ENV['HADOOP_HOME']
69
+ begin
70
+ ENV['PATH'] = ''
71
+ lambda { JRubyOnHadoop::Client.new }.should_not raise_error
72
+ ENV['HADOOP_HOME'] = ''
73
+ lambda { JRubyOnHadoop::Client.new }.should raise_error
74
+ ensure
75
+ ENV['PATH'] = saved_path
76
+ ENV['HADOOP_HOME'] = saved_home
77
+ end
78
+ end
79
+ end
@@ -18,40 +18,3 @@ describe JRubyOnHadoop do
18
18
  JRubyOnHadoop.wrapper_ruby_file.should == path
19
19
  end
20
20
  end
21
-
22
- describe JRubyOnHadoop::Client do
23
- before do
24
- @client = JRubyOnHadoop::Client.new
25
- end
26
-
27
- it 'gather necessary jar paths' do
28
- version_pattern = '[\d\.]*'
29
- @client.main_jar_path.should include 'hadoop-ruby.jar'
30
-
31
- @client.opt_libjars.should match /jruby\-core\-#{version_pattern}\.jar/
32
- @client.opt_libjars.should match /jruby\-stdlib\-#{version_pattern}\.jar/
33
- end
34
-
35
- it 'gather necessary ruby files' do
36
- @client.opt_files.split(",").should include "mapred.rb"
37
- @client.opt_files.should match /ruby_wrapper\.rb/
38
- end
39
-
40
- it 'construct command for running hadoop' do
41
- path_pattern = '[\w/\-\.,]*'
42
- @client.cmd.should match /hadoop jar #{path_pattern}hadoop-ruby.jar org.apache.hadoop.ruby.JRubyJobRunner -libjars #{path_pattern}.jar -files mapred.rb/
43
- end
44
-
45
- it 'can get mapred args' do
46
- client = JRubyOnHadoop::Client.new(["examples/mapred.rb", "inputs", "outputs"])
47
- client.mapred_args.should == "--script mapred.rb inputs outputs"
48
- end
49
-
50
- it 'can parse args' do
51
- client = JRubyOnHadoop::Client.new(["examples/mapred.rb", "in", "out"])
52
- client.script.should == 'mapred.rb'
53
- client.inputs.should == 'in'
54
- client.outputs.should == 'out'
55
- client.files.should include 'examples/mapred.rb'
56
- end
57
- end
@@ -5,7 +5,7 @@ describe 'wrapper' do
5
5
  examples_dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'examples'))
6
6
  $: << examples_dir
7
7
 
8
- @script = 'mapred.rb'
8
+ @script = 'wordcount.rb'
9
9
  @output, @repoter = mock('output'), mock('repoter')
10
10
 
11
11
  @key, @value = Text.new, Text.new
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jruby-on-hadoop
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Koichi Fujikawa
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2010-01-03 00:00:00 +09:00
12
+ date: 2010-01-15 00:00:00 +09:00
13
13
  default_executable: joh
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -42,6 +42,7 @@ files:
42
42
  - lib/jruby-on-hadoop.rb
43
43
  - lib/jruby-on-hadoop/client.rb
44
44
  - lib/ruby_wrapper.rb
45
+ - spec/jruby-on-hadoop/client_spec.rb
45
46
  - spec/jruby-on-hadoop_spec.rb
46
47
  - spec/ruby_wrapper_spec.rb
47
48
  - test/java/org/apache/hadoop/ruby/JRubyJobRunnerTest.java
@@ -76,6 +77,7 @@ signing_key:
76
77
  specification_version: 3
77
78
  summary: JRuby on Hadoop
78
79
  test_files:
80
+ - spec/jruby-on-hadoop/client_spec.rb
79
81
  - spec/jruby-on-hadoop_spec.rb
80
82
  - spec/ruby_wrapper_spec.rb
81
83
  - examples/wordcount.rb