jruby-on-hadoop 0.0.2 → 0.0.3

This diff shows the changes between publicly available package versions as released to one of the supported registries, and is provided for informational purposes only.
data/README.rdoc ADDED
@@ -0,0 +1,47 @@
+ = JRuby on Hadoop
+
+ JRuby on Hadoop is a thin JRuby wrapper that lets you write Hadoop Mappers and Reducers in Ruby.
+
+ == Install
+
+ All required gems are available on Gemcutter.
+
+ 1. Upgrade RubyGems to 1.3.5.
+ 2. Install the gem:
+      $ gem install jruby-on-hadoop
+
+ == Description
+
+ 1. Run a Hadoop cluster on your machines and set the HADOOP_HOME environment variable.
+ 2. Put input files into your HDFS, e.g. test/inputs/file1.
+ 3. Run 'joh' like below:
+      $ joh examples/wordcount.rb test/inputs test/outputs
+    The Hadoop job results are written to test/outputs/part-* in your HDFS.
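+    You can inspect them with the standard Hadoop CLI, for example:
+      $ $HADOOP_HOME/bin/hadoop fs -cat test/outputs/part-*
+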
+ Script example (see also examples/wordcount.rb):
+
+   def setup(conf)
+     # set up the job conf; optionally return [input_path, output_path]
+   end
+
+   def map(key, value, output, reporter)
+     # mapper process
+   end
+
+   def reduce(key, values, output, reporter)
+     # reducer process
+   end
+
+ == Build
+
+ You can build hadoop-ruby.jar with "ant":
+
+   ant
+
+ The HADOOP_HOME environment variable must be set on your system.
+ The assumed Hadoop version is 0.19.2.
+
+ == Author
+ Koichi Fujikawa <fujibee@gmail.com>
+
+ == Copyright
+ License: Apache License
data/VERSION CHANGED
@@ -1 +1 @@
- 0.0.2
+ 0.0.3
data/build.xml CHANGED
@@ -9,10 +9,10 @@
  ====================================================================== -->
  <project name="build JRuby map reduce" default="jar">
 
-   <property environment="env" />
-   <property name="hadoop.home" value="${env.HADOOP_HOME}" />
-
-   <property name="version" value="0.0.1" />
+   <property environment="env" />
+   <property name="hadoop.home" value="${env.HADOOP_HOME}" />
+
+   <property name="version" value="0.0.2" />
    <property name="src.dir" value="${basedir}/src/java" />
    <property name="build.dir" value="${basedir}/build" />
    <property name="dist.dir" value="${basedir}/dist" />
@@ -20,7 +20,7 @@
    <property name="hadoop.version" value="0.19.2" />
    <property name="hadoop.jar" value="${hadoop.home}/hadoop-${hadoop.version}-core.jar" />
 
-   <target name="jar" depends="compile">
+   <target name="jar" depends="clean,compile">
      <mkdir dir="${dist.dir}" />
      <jar jarfile="${dist.dir}/hadoop-ruby.jar" basedir="${build.dir}">
        <manifest>
@@ -33,10 +33,22 @@
      </jar>
    </target>
 
-   <target name="compile">
-     <echo message="${hadoop.jar}" />
+   <target name="compile">
+     <echo message="${hadoop.jar}" />
      <mkdir dir="${build.dir}" />
-     <javac srcdir="${src.dir}" destdir="${build.dir}" classpath="${hadoop.jar}" />
-   </target>
+     <javac srcdir="${src.dir}" destdir="${build.dir}">
+       <classpath>
+         <fileset dir="${hadoop.home}">
+           <include name="*.jar" />
+           <include name="lib/*.jar" />
+         </fileset>
+       </classpath>
+     </javac>
+   </target>
+
+   <target name="clean">
+     <delete dir="${build.dir}" />
+     <delete dir="${dist.dir}" />
+   </target>
 
  </project>
data/examples/wordcount.rb ADDED
@@ -0,0 +1,18 @@
+ # wordcount example
+
+ # optional: returns the default [input_dir, output_dir]
+ def setup(conf)
+   ['test/inputs', 'test/outputs']
+ end
+
+ def map(key, value, output, reporter)
+   value.split.each do |word|
+     output.collect(word, 1)
+   end
+ end
+
+ def reduce(key, values, output, reporter)
+   sum = 0
+   values.each {|v| sum += v }
+   output.collect(key, sum)
+ end
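You can sanity-check the script logic without a cluster by driving it with a stand-in collector. A sketch, where Collector is a hypothetical substitute for Hadoop's OutputCollector (run from the repository root):

    # Hypothetical stand-in for Hadoop's OutputCollector.
    class Collector
      attr_reader :pairs
      def collect(key, value)
        (@pairs ||= []) << [key, value]
      end
    end

    load 'examples/wordcount.rb'
    out = Collector.new
    map(nil, 'foo bar foo', out, nil)
    out.pairs  # => [["foo", 1], ["bar", 1], ["foo", 1]]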
data/jruby-on-hadoop.gemspec CHANGED
@@ -5,28 +5,35 @@
 
  Gem::Specification.new do |s|
    s.name = %q{jruby-on-hadoop}
-   s.version = "0.0.2"
+   s.version = "0.0.3"
 
    s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
    s.authors = ["Koichi Fujikawa"]
-   s.date = %q{2009-12-28}
+   s.date = %q{2010-01-03}
    s.default_executable = %q{joh}
    s.description = %q{JRuby on Hadoop}
    s.email = %q{fujibee@gmail.com}
    s.executables = ["joh"]
    s.extra_rdoc_files = [
-     "README"
+     "README.rdoc"
    ]
    s.files = [
-     "README",
+     "README.rdoc",
      "Rakefile",
      "VERSION",
+     "bin/joh",
      "build.xml",
+     "examples/wordcount.rb",
      "jruby-on-hadoop.gemspec",
      "lib/hadoop-ruby.jar",
      "lib/jruby-on-hadoop.rb",
      "lib/jruby-on-hadoop/client.rb",
-     "spec/jruby-on-hadoop_spec.rb"
+     "lib/ruby_wrapper.rb",
+     "spec/jruby-on-hadoop_spec.rb",
+     "spec/ruby_wrapper_spec.rb",
+     "test/java/org/apache/hadoop/ruby/JRubyJobRunnerTest.java",
+     "test/java/org/apache/hadoop/ruby/mapred/JRubyMapperTest.java",
+     "test/java/org/apache/hadoop/ruby/mapred/JRubyReducerTest.java"
    ]
    s.homepage = %q{http://github.com/fujibee/jruby-on-hadoop}
    s.rdoc_options = ["--charset=UTF-8"]
@@ -34,7 +41,9 @@ Gem::Specification.new do |s|
    s.rubygems_version = %q{1.3.5}
    s.summary = %q{JRuby on Hadoop}
    s.test_files = [
-     "spec/jruby-on-hadoop_spec.rb"
+     "spec/jruby-on-hadoop_spec.rb",
+     "spec/ruby_wrapper_spec.rb",
+     "examples/wordcount.rb"
    ]
 
    if s.respond_to? :specification_version then
data/lib/hadoop-ruby.jar CHANGED
Binary file
data/lib/jruby-on-hadoop.rb CHANGED
@@ -2,7 +2,16 @@ require 'jruby-jars'
  require 'jruby-on-hadoop/client'
 
  module JRubyOnHadoop
+
+   def self.lib_path
+     File.expand_path(File.dirname(__FILE__))
+   end
+
    def self.jar_path
-     File.join(File.expand_path(File.dirname(__FILE__)), "hadoop-ruby.jar")
+     File.join(lib_path, "hadoop-ruby.jar")
+   end
+
+   def self.wrapper_ruby_file
+     File.join(lib_path, "ruby_wrapper.rb")
    end
  end
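These helpers just resolve paths inside the installed gem. A quick sketch of the values they return (paths abbreviated, illustrative):

    require 'jruby-on-hadoop'

    JRubyOnHadoop.lib_path           # => ".../gems/jruby-on-hadoop-0.0.3/lib"
    JRubyOnHadoop.jar_path           # => ".../lib/hadoop-ruby.jar"
    JRubyOnHadoop.wrapper_ruby_file  # => ".../lib/ruby_wrapper.rb"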
data/lib/jruby-on-hadoop/client.rb CHANGED
@@ -2,9 +2,17 @@ module JRubyOnHadoop
    JAVA_MAIN_CLASS = 'org.apache.hadoop.ruby.JRubyJobRunner'
 
    class Client
-     def initialize(argv=[])
-       @init_script = argv[0] || 'mapred.rb'
-       @args = argv[1..argv.size-1].join(" ") if argv.size > 0
+     attr_reader :script, :inputs, :outputs, :files
+
+     def initialize(args=[])
+       @args = args
+       parse_args
+
+       # env check
+       hadoop_home = ENV['HADOOP_HOME']
+       raise 'HADOOP_HOME is not set' unless hadoop_home
+       @hadoop_cmd = "#{hadoop_home}/bin/hadoop"
+       ENV['HADOOP_CLASSPATH'] = "#{lib_path}:#{File.dirname(@script_path)}"
      end
 
      def run
@@ -12,16 +20,40 @@ module JRubyOnHadoop
      end
 
      def cmd
-       "hadoop jar #{main_jar_path} #{JAVA_MAIN_CLASS}" +
-         " -libjars #{jruby_jar_paths} -files #{@init_script} #{@args}"
+       "#{@hadoop_cmd} jar #{main_jar_path} #{JAVA_MAIN_CLASS}" +
+         " -libjars #{opt_libjars} -files #{opt_files} #{mapred_args}"
+     end
+
+     def parse_args
+       @script_path = @args.size > 0 ? @args[0] : 'mapred.rb'
+       @script = File.basename(@script_path)
+       @inputs = @args[1] if @args.size == 3
+       @outputs = @args[2] if @args.size == 3
+       @files = [@script_path, JRubyOnHadoop.wrapper_ruby_file]
+     end
+
+     def mapred_args
+       args = "--script #{@script} "
+       args += "#{@inputs} " if @inputs
+       args += "#{@outputs}" if @outputs
+       args
+     end
+
+     def opt_libjars
+       # jruby jars
+       [JRubyJars.core_jar_path, JRubyJars.stdlib_jar_path].join(',')
+     end
+
+     def opt_files
+       @files.join(',')
      end
 
      def main_jar_path
        JRubyOnHadoop.jar_path
      end
 
-     def jruby_jar_paths
-       [JRubyJars.core_jar_path, JRubyJars.stdlib_jar_path].join(',')
+     def lib_path
+       JRubyOnHadoop.lib_path
      end
    end
  end
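For reference, a minimal sketch of what the reworked Client produces. The HADOOP_HOME value and the abbreviated paths are illustrative; Client.new raises unless HADOOP_HOME is set:

    require 'jruby-on-hadoop'

    ENV['HADOOP_HOME'] = '/usr/local/hadoop'  # illustrative
    client = JRubyOnHadoop::Client.new(['examples/wordcount.rb', 'test/inputs', 'test/outputs'])
    client.script       # => "wordcount.rb"
    client.mapred_args  # => "--script wordcount.rb test/inputs test/outputs"
    client.cmd          # => "/usr/local/hadoop/bin/hadoop jar .../hadoop-ruby.jar
                        #     org.apache.hadoop.ruby.JRubyJobRunner
                        #     -libjars .../jruby-core-<version>.jar,.../jruby-stdlib-<version>.jar
                        #     -files examples/wordcount.rb,.../lib/ruby_wrapper.rb
                        #     --script wordcount.rb test/inputs test/outputs"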
data/lib/ruby_wrapper.rb ADDED
@@ -0,0 +1,59 @@
+ require 'java'
+
+ import 'org.apache.hadoop.io.IntWritable'
+ import 'org.apache.hadoop.io.LongWritable'
+ import 'org.apache.hadoop.io.Text'
+
+ # Entry points invoked from the Java side.
+ def wrap_setup(conf, script, dslfile)
+   require script
+   paths = dslfile ? setup(conf, dslfile) : setup(conf)
+   paths.to_java if paths
+ end
+
+ def wrap_map(key, value, output, reporter, script, dslfile)
+   require script
+   output_wrapper = OutputWrapper.new(output)
+   dslfile ?
+     map(to_ruby(key), to_ruby(value), output_wrapper, reporter, dslfile) :
+     map(to_ruby(key), to_ruby(value), output_wrapper, reporter)
+ end
+
+ def wrap_reduce(key, values, output, reporter, script, dslfile)
+   require script
+   output_wrapper = OutputWrapper.new(output)
+   dslfile ?
+     reduce(to_ruby(key), to_ruby(values), output_wrapper, reporter, dslfile) :
+     reduce(to_ruby(key), to_ruby(values), output_wrapper, reporter)
+ end
+
+ # Converts Ruby keys/values to Writables before collecting.
+ class OutputWrapper
+   def initialize(output)
+     @output = output
+   end
+
+   def collect(key, value)
+     @output.collect(to_java(key), to_java(value))
+   end
+ end
+
+ def to_ruby(value)
+   case value
+   when IntWritable, LongWritable then value.get
+   when Text then value.to_string
+   else
+     # for Java array
+     if value.respond_to? :map
+       value.map {|v| to_ruby(v)}
+     else value # as is
+     end
+   end
+ end
+
+ def to_java(value)
+   case value
+   when Integer then IntWritable.new(value)
+   when String then t = Text.new; t.set(value); t
+   when Array then value.to_java
+   else raise "no match class: #{value.class}"
+   end
+ end
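A sketch of how the to_ruby / to_java converters behave when run under JRuby with the Hadoop core jar on the classpath (values illustrative):

    t = Text.new
    t.set('word')
    to_ruby(t)                    # => "word"
    to_ruby(IntWritable.new(42))  # => 42
    to_ruby([t].to_java)          # => ["word"]  (Java arrays convert element-wise)
    to_java(7)                    # => IntWritable wrapping 7
    to_java('word')               # => Text wrapping "word"
    to_java(3.14)                 # raises "no match class: Float"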
data/spec/jruby-on-hadoop_spec.rb CHANGED
@@ -6,21 +6,52 @@ describe JRubyOnHadoop do
      jar_path = File.join(File.expand_path(jar_dir), 'hadoop-ruby.jar')
      JRubyOnHadoop.jar_path.should == jar_path
    end
+
+   it 'should return lib path' do
+     lib_dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
+     JRubyOnHadoop.lib_path.should == lib_dir
+   end
+
+   it 'should return wrapper ruby file' do
+     dir = File.join(File.dirname(__FILE__), '..', 'lib')
+     path = File.join(File.expand_path(dir), 'ruby_wrapper.rb')
+     JRubyOnHadoop.wrapper_ruby_file.should == path
+   end
  end
 
  describe JRubyOnHadoop::Client do
+   before do
+     @client = JRubyOnHadoop::Client.new
+   end
+
    it 'gather necessary jar paths' do
      version_pattern = '[\d\.]*'
-     client = JRubyOnHadoop::Client.new
-     client.main_jar_path.should include 'hadoop-ruby.jar'
+     @client.main_jar_path.should include 'hadoop-ruby.jar'
 
-     client.jruby_jar_paths.should match /jruby\-core\-#{version_pattern}\.jar/
-     client.jruby_jar_paths.should match /jruby\-stdlib\-#{version_pattern}\.jar/
+     @client.opt_libjars.should match /jruby\-core\-#{version_pattern}\.jar/
+     @client.opt_libjars.should match /jruby\-stdlib\-#{version_pattern}\.jar/
+   end
+
+   it 'gather necessary ruby files' do
+     @client.opt_files.split(",").should include "mapred.rb"
+     @client.opt_files.should match /ruby_wrapper\.rb/
    end
 
    it 'construct command for running hadoop' do
      path_pattern = '[\w/\-\.,]*'
-     client = JRubyOnHadoop::Client.new
-     client.cmd.should match /hadoop jar #{path_pattern}hadoop-ruby.jar org.apache.hadoop.ruby.JRubyJobRunner -libjars #{path_pattern}.jar -files mapred.rb/
+     @client.cmd.should match /hadoop jar #{path_pattern}hadoop-ruby.jar org.apache.hadoop.ruby.JRubyJobRunner -libjars #{path_pattern}.jar -files mapred.rb/
+   end
+
+   it 'can get mapred args' do
+     client = JRubyOnHadoop::Client.new(["examples/mapred.rb", "inputs", "outputs"])
+     client.mapred_args.should == "--script mapred.rb inputs outputs"
+   end
+
+   it 'can parse args' do
+     client = JRubyOnHadoop::Client.new(["examples/mapred.rb", "in", "out"])
+     client.script.should == 'mapred.rb'
+     client.inputs.should == 'in'
+     client.outputs.should == 'out'
+     client.files.should include 'examples/mapred.rb'
    end
  end
data/spec/ruby_wrapper_spec.rb ADDED
@@ -0,0 +1,30 @@
+ require 'ruby_wrapper'
+
+ describe 'wrapper' do
+   before do
+     examples_dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'examples'))
+     $: << examples_dir
+
+     @script = 'mapred.rb'
+     @output, @reporter = mock('output'), mock('reporter')
+
+     @key, @value = Text.new, Text.new
+     @key.set('key')
+     @value.set('value')
+   end
+
+   it 'can wrap setup' do
+     wrap_setup(mock('conf'), @script, nil)
+   end
+
+   it 'can wrap mapper' do
+     @output.should_receive(:collect).once
+     wrap_map(@key, @value, @output, @reporter, @script, nil)
+   end
+
+   it 'can wrap reducer' do
+     @output.should_receive(:collect).once
+     values = [1, 2, 3].map {|v| IntWritable.new(v)}.to_java
+     wrap_reduce(@key, values, @output, @reporter, @script, nil)
+   end
+ end
data/test/java/org/apache/hadoop/ruby/JRubyJobRunnerTest.java ADDED
@@ -0,0 +1,18 @@
+ package org.apache.hadoop.ruby;
+
+ import org.apache.hadoop.conf.Configuration;
+ import org.junit.Test;
+
+ public class JRubyJobRunnerTest {
+
+   @Test
+   public void testRun() throws Exception {
+     JRubyJobRunner runner = new JRubyJobRunner();
+     Configuration conf = new Configuration();
+     runner.setConf(conf);
+     String[] args = { "--script", "mapred.rb", "inputs", "outputs" };
+     try {
+       runner.run(args);
+     } catch (Throwable t) { /* ignore */ }
+   }
+ }
data/test/java/org/apache/hadoop/ruby/mapred/JRubyMapperTest.java ADDED
@@ -0,0 +1,31 @@
+ package org.apache.hadoop.ruby.mapred;
+
+ import java.io.IOException;
+
+ import org.apache.hadoop.io.LongWritable;
+ import org.apache.hadoop.io.Text;
+ import org.apache.hadoop.mapred.JobConf;
+ import org.junit.Test;
+
+ public class JRubyMapperTest {
+
+   @Test
+   public void testMap() throws IOException {
+     LongWritable key = new LongWritable(0L);
+     Text value = new Text();
+     value.set("value");
+
+     JRubyMapper mapper = new JRubyMapper();
+     JobConf conf = new JobConf();
+     conf.set("mapred.ruby.script", "mapred.rb");
+     mapper.configure(conf);
+
+     try {
+       mapper.map(key, value, null, null);
+     } catch (Throwable t) {
+       // ignore
+       // TODO mock check
+     }
+   }
+
+ }
data/test/java/org/apache/hadoop/ruby/mapred/JRubyReducerTest.java ADDED
@@ -0,0 +1,36 @@
+ package org.apache.hadoop.ruby.mapred;
+
+ import java.io.IOException;
+ import java.util.ArrayList;
+ import java.util.List;
+
+ import org.apache.hadoop.io.IntWritable;
+ import org.apache.hadoop.io.Text;
+ import org.apache.hadoop.mapred.JobConf;
+ import org.junit.Test;
+
+ public class JRubyReducerTest {
+
+   @Test
+   public void testReduce() throws IOException {
+     Text key = new Text();
+     key.set("key");
+     List<IntWritable> values = new ArrayList<IntWritable>();
+     values.add(new IntWritable(1));
+     values.add(new IntWritable(2));
+     values.add(new IntWritable(3));
+
+     JRubyReducer reducer = new JRubyReducer();
+     JobConf conf = new JobConf();
+     conf.set("mapred.ruby.script", "mapred.rb");
+     reducer.configure(conf);
+
+     try {
+       reducer.reduce(key, values.iterator(), null, null);
+     } catch (Throwable t) {
+       // ignore
+       // TODO mock check
+     }
+   }
+
+ }
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: jruby-on-hadoop
  version: !ruby/object:Gem::Version
-   version: 0.0.2
+   version: 0.0.3
  platform: ruby
  authors:
  - Koichi Fujikawa
@@ -9,7 +9,7 @@ autorequire:
  bindir: bin
  cert_chain: []
 
- date: 2009-12-28 00:00:00 +09:00
+ date: 2010-01-03 00:00:00 +09:00
  default_executable: joh
  dependencies:
  - !ruby/object:Gem::Dependency
@@ -29,17 +29,24 @@ executables:
  extensions: []
 
  extra_rdoc_files:
- - README
+ - README.rdoc
  files:
- - README
+ - README.rdoc
  - Rakefile
  - VERSION
+ - bin/joh
  - build.xml
+ - examples/wordcount.rb
  - jruby-on-hadoop.gemspec
  - lib/hadoop-ruby.jar
  - lib/jruby-on-hadoop.rb
  - lib/jruby-on-hadoop/client.rb
+ - lib/ruby_wrapper.rb
  - spec/jruby-on-hadoop_spec.rb
+ - spec/ruby_wrapper_spec.rb
+ - test/java/org/apache/hadoop/ruby/JRubyJobRunnerTest.java
+ - test/java/org/apache/hadoop/ruby/mapred/JRubyMapperTest.java
+ - test/java/org/apache/hadoop/ruby/mapred/JRubyReducerTest.java
  has_rdoc: true
  homepage: http://github.com/fujibee/jruby-on-hadoop
  licenses: []
@@ -70,3 +77,5 @@ specification_version: 3
  summary: JRuby on Hadoop
  test_files:
  - spec/jruby-on-hadoop_spec.rb
+ - spec/ruby_wrapper_spec.rb
+ - examples/wordcount.rb
data/README DELETED
@@ -1,30 +0,0 @@
- = hadoop-ruby
-
- == Description
- You can write Hadoop Mappers/Reducers in Ruby.
- This is a JRuby wrapper for hadoop-rubydsl.
-
- Example (init.rb):
-
-   def map(script, key, value, output, reporter)
-     # map process
-   end
-
-   def reduce(script, key, values, output, reporter)
-     # reduce process
-   end
-
- == Build
-
- Run:
-
-   ant
-
- You must set the HADOOP_HOME environment variable.
- The assumed Hadoop version is 0.19.2.
-
- == Author
- Koichi Fujikawa <fujibee@gmail.com>
-
- == Copyright
- License: Apache License