jruby-on-hadoop 0.0.2 → 0.0.3

data/README.rdoc ADDED
@@ -0,0 +1,47 @@
+ = JRuby on Hadoop
+
+ JRuby on Hadoop is a thin JRuby wrapper for writing Hadoop Mappers and Reducers in Ruby.
+
+ == Install
+
+ All required gems are on GemCutter.
+
+ 1. Upgrade your RubyGems to 1.3.5
+ 2. Install the gem:
+    $ gem install jruby-on-hadoop
+
+ == Description
+
+ 1. Run a Hadoop cluster on your machines and set the HADOOP_HOME env variable.
+ 2. Put input files into your HDFS, e.g. test/inputs/file1.
+ 3. Now you can run 'joh' like below:
+    $ joh examples/wordcount.rb test/inputs test/outputs
+    Job results appear in your HDFS under test/outputs/part-*.
+
+ Script example (see also examples/wordcount.rb):
+
+   def setup(conf)
+     # set up the jobconf (optionally return [input_dir, output_dir])
+   end
+
+   def map(key, value, output, reporter)
+     # mapper process
+   end
+
+   def reduce(key, values, output, reporter)
+     # reducer process
+   end
+
+ == Build
+
+ You can build hadoop-ruby.jar with "ant":
+   ant
+
+ The HADOOP_HOME environment variable must be set for your system.
+ The assumed Hadoop version is 0.19.2.
+
+ == Author
+ Koichi Fujikawa <fujibee@gmail.com>
+
+ == Copyright
+ License: Apache License
data/VERSION CHANGED
@@ -1 +1 @@
- 0.0.2
+ 0.0.3
data/build.xml CHANGED
@@ -9,10 +9,10 @@
  ====================================================================== -->
  <project name="build JRuby map reduce" default="jar">
 
- <property environment="env" />
- <property name="hadoop.home" value="${env.HADOOP_HOME}" />
-
- <property name="version" value="0.0.1" />
+ <property environment="env" />
+ <property name="hadoop.home" value="${env.HADOOP_HOME}" />
+
+ <property name="version" value="0.0.2" />
 
  <property name="src.dir" value="${basedir}/src/java" />
  <property name="build.dir" value="${basedir}/build" />
  <property name="dist.dir" value="${basedir}/dist" />
@@ -20,7 +20,7 @@
  <property name="hadoop.version" value="0.19.2" />
  <property name="hadoop.jar" value="${hadoop.home}/hadoop-${hadoop.version}-core.jar" />
 
- <target name="jar" depends="compile">
+ <target name="jar" depends="clean,compile">
  <mkdir dir="${dist.dir}" />
  <jar jarfile="${dist.dir}/hadoop-ruby.jar" basedir="${build.dir}">
  <manifest>
@@ -33,10 +33,22 @@
  </jar>
  </target>
 
- <target name="compile">
- <echo message="${hadoop.jar}" />
+ <target name="compile">
+ <echo message="${hadoop.jar}" />
  <mkdir dir="${build.dir}" />
- <javac srcdir="${src.dir}" destdir="${build.dir}" classpath="${hadoop.jar}" />
- </target>
+ <javac srcdir="${src.dir}" destdir="${build.dir}">
+   <classpath>
+     <fileset dir="${hadoop.home}">
+       <include name="*.jar" />
+       <include name="lib/*.jar" />
+     </fileset>
+   </classpath>
+ </javac>
+ </target>
+
+ <target name="clean">
+   <delete dir="${build.dir}" />
+   <delete dir="${dist.dir}" />
+ </target>
 
  </project>
data/examples/wordcount.rb ADDED
@@ -0,0 +1,18 @@
+ # wordcount example
+
+ # not necessary
+ def setup(conf)
+   ['test/inputs', 'test/outputs']
+ end
+
+ def map(key, value, output, reporter)
+   value.split.each do |word|
+     output.collect(word, 1)
+   end
+ end
+
+ def reduce(key, values, output, reporter)
+   sum = 0
+   values.each {|v| sum += v }
+   output.collect(key, sum)
+ end
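For illustration only (not part of the gem): the map/reduce functions above are plain Ruby, so you can sanity-check them locally without a cluster. The Collector class below is a hypothetical stand-in for the OutputWrapper that ruby_wrapper.rb passes in at runtime.

    # hypothetical local driver, run from the repo root
    require './examples/wordcount'

    # minimal stand-in for the wrapper's OutputWrapper
    class Collector
      attr_reader :pairs
      def initialize; @pairs = []; end
      def collect(key, value); @pairs << [key, value]; end
    end

    mapped = Collector.new
    map(0, 'foo bar foo', mapped, nil)

    final = Collector.new
    mapped.pairs.group_by {|k, _| k }.each do |word, pairs|
      reduce(word, pairs.map {|_, v| v }, final, nil)
    end
    final.pairs.sort.each {|k, v| puts "#{k}\t#{v}" }
    # => bar 1
    #    foo 2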
data/jruby-on-hadoop.gemspec CHANGED
@@ -5,28 +5,35 @@
 
  Gem::Specification.new do |s|
    s.name = %q{jruby-on-hadoop}
-   s.version = "0.0.2"
+   s.version = "0.0.3"
 
    s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
    s.authors = ["Koichi Fujikawa"]
-   s.date = %q{2009-12-28}
+   s.date = %q{2010-01-03}
    s.default_executable = %q{joh}
    s.description = %q{JRuby on Hadoop}
    s.email = %q{fujibee@gmail.com}
    s.executables = ["joh"]
    s.extra_rdoc_files = [
-     "README"
+     "README.rdoc"
    ]
    s.files = [
-     "README",
+     "README.rdoc",
      "Rakefile",
      "VERSION",
+     "bin/joh",
      "build.xml",
+     "examples/wordcount.rb",
      "jruby-on-hadoop.gemspec",
      "lib/hadoop-ruby.jar",
      "lib/jruby-on-hadoop.rb",
      "lib/jruby-on-hadoop/client.rb",
-     "spec/jruby-on-hadoop_spec.rb"
+     "lib/ruby_wrapper.rb",
+     "spec/jruby-on-hadoop_spec.rb",
+     "spec/ruby_wrapper_spec.rb",
+     "test/java/org/apache/hadoop/ruby/JRubyJobRunnerTest.java",
+     "test/java/org/apache/hadoop/ruby/mapred/JRubyMapperTest.java",
+     "test/java/org/apache/hadoop/ruby/mapred/JRubyReducerTest.java"
    ]
    s.homepage = %q{http://github.com/fujibee/jruby-on-hadoop}
    s.rdoc_options = ["--charset=UTF-8"]
@@ -34,7 +41,9 @@ Gem::Specification.new do |s|
    s.rubygems_version = %q{1.3.5}
    s.summary = %q{JRuby on Hadoop}
    s.test_files = [
-     "spec/jruby-on-hadoop_spec.rb"
+     "spec/jruby-on-hadoop_spec.rb",
+     "spec/ruby_wrapper_spec.rb",
+     "examples/wordcount.rb"
    ]
 
  if s.respond_to? :specification_version then
data/lib/hadoop-ruby.jar CHANGED
Binary file
data/lib/jruby-on-hadoop.rb CHANGED
@@ -2,7 +2,16 @@ require 'jruby-jars'
  require 'jruby-on-hadoop/client'
 
  module JRubyOnHadoop
+
+   def self.lib_path
+     File.expand_path(File.dirname(__FILE__))
+   end
+
    def self.jar_path
-     File.join(File.expand_path(File.dirname(__FILE__)), "hadoop-ruby.jar")
+     File.join(lib_path, "hadoop-ruby.jar")
+   end
+
+   def self.wrapper_ruby_file
+     File.join(lib_path, "ruby_wrapper.rb")
    end
  end
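For reference, a sketch of what the new helpers return (install-dependent prefixes elided as "..."):

    require 'jruby-on-hadoop'

    JRubyOnHadoop.lib_path           # => ".../jruby-on-hadoop/lib"
    JRubyOnHadoop.jar_path           # => ".../lib/hadoop-ruby.jar"
    JRubyOnHadoop.wrapper_ruby_file  # => ".../lib/ruby_wrapper.rb"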
data/lib/jruby-on-hadoop/client.rb CHANGED
@@ -2,9 +2,17 @@ module JRubyOnHadoop
   JAVA_MAIN_CLASS = 'org.apache.hadoop.ruby.JRubyJobRunner'
 
   class Client
-    def initialize(argv=[])
-      @init_script = argv[0] || 'mapred.rb'
-      @args = argv[1..argv.size-1].join(" ") if argv.size > 0
+    attr_reader :script, :inputs, :outputs, :files
+
+    def initialize(args=[])
+      @args = args
+      parse_args
+
+      # env check
+      hadoop_home = ENV['HADOOP_HOME']
+      raise 'HADOOP_HOME is not set' unless hadoop_home
+      @hadoop_cmd = "#{hadoop_home}/bin/hadoop"
+      ENV['HADOOP_CLASSPATH'] = "#{lib_path}:#{File.dirname(@script_path)}"
     end
 
     def run
@@ -12,16 +20,40 @@ module JRubyOnHadoop
     end
 
     def cmd
-      "hadoop jar #{main_jar_path} #{JAVA_MAIN_CLASS}" +
-        " -libjars #{jruby_jar_paths} -files #{@init_script} #{@args}"
+      "#{@hadoop_cmd} jar #{main_jar_path} #{JAVA_MAIN_CLASS}" +
+        " -libjars #{opt_libjars} -files #{opt_files} #{mapred_args}"
+    end
+
+    def parse_args
+      @script_path = @args.size > 0 ? @args[0] : 'mapred.rb'
+      @script = File.basename(@script_path)
+      @inputs = @args[1] if @args.size == 3
+      @outputs = @args[2] if @args.size == 3
+      @files = [@script_path, JRubyOnHadoop.wrapper_ruby_file]
+    end
+
+    def mapred_args
+      args = "--script #{@script} "
+      args += "#{@inputs} " if @inputs
+      args += "#{@outputs}" if @outputs
+      args
+    end
+
+    def opt_libjars
+      # jruby jars
+      [JRubyJars.core_jar_path, JRubyJars.stdlib_jar_path].join(',')
+    end
+
+    def opt_files
+      @files.join(',')
     end
 
     def main_jar_path
       JRubyOnHadoop.jar_path
     end
 
-    def jruby_jar_paths
-      [JRubyJars.core_jar_path, JRubyJars.stdlib_jar_path].join(',')
+    def lib_path
+      JRubyOnHadoop.lib_path
     end
   end
 end
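A sketch of the command the reworked Client now assembles, assuming HADOOP_HOME is set (initialize raises otherwise); "..." marks install-dependent paths and versions:

    require 'jruby-on-hadoop'

    client = JRubyOnHadoop::Client.new(%w[examples/wordcount.rb test/inputs test/outputs])
    puts client.cmd
    # $HADOOP_HOME/bin/hadoop jar .../lib/hadoop-ruby.jar org.apache.hadoop.ruby.JRubyJobRunner
    #   -libjars .../jruby-core-....jar,.../jruby-stdlib-....jar
    #   -files examples/wordcount.rb,.../lib/ruby_wrapper.rb
    #   --script wordcount.rb test/inputs test/outputs
    client.run  # presumably shells out to the command above

Note that only the script's basename travels in --script, while the full path rides along in -files, and ruby_wrapper.rb is always shipped alongside it.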
data/lib/ruby_wrapper.rb ADDED
@@ -0,0 +1,59 @@
+ require 'java'
+
+ import 'org.apache.hadoop.io.IntWritable'
+ import 'org.apache.hadoop.io.LongWritable'
+ import 'org.apache.hadoop.io.Text'
+
+ def wrap_setup(conf, script, dslfile)
+   require script
+   paths = dslfile ? setup(conf, dslfile) : setup(conf)
+   paths.to_java if paths
+ end
+
+ def wrap_map(key, value, output, reporter, script, dslfile)
+   require script
+   output_wrapper = OutputWrapper.new(output)
+   dslfile ?
+     map(to_ruby(key), to_ruby(value), output_wrapper, reporter, dslfile) :
+     map(to_ruby(key), to_ruby(value), output_wrapper, reporter)
+ end
+
+ def wrap_reduce(key, values, output, reporter, script, dslfile)
+   require script
+   output_wrapper = OutputWrapper.new(output)
+   dslfile ?
+     reduce(to_ruby(key), to_ruby(values), output_wrapper, reporter, dslfile) :
+     reduce(to_ruby(key), to_ruby(values), output_wrapper, reporter)
+ end
+
+ class OutputWrapper
+   def initialize(output)
+     @output = output
+   end
+
+   def collect(key, value)
+     @output.collect(to_java(key), to_java(value))
+   end
+ end
+
+ def to_ruby(value)
+   case value
+   when IntWritable, LongWritable then value.get
+   when Text then value.to_string
+   else
+     # for Java array
+     if value.respond_to? :map
+       value.map {|v| to_ruby(v)}
+     else value # as is
+     end
+   end
+ end
+
+ def to_java(value)
+   case value
+   when Integer then IntWritable.new(value)
+   when String then t = Text.new; t.set(value); t
+   when Array then value.to_java
+   else raise "no match class: #{value.class}"
+   end
+ end
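A quick JRuby-only sketch of the boundary conversions above, assuming the Hadoop core jar is on the classpath so the Writable imports resolve:

    require 'ruby_wrapper'

    to_ruby(IntWritable.new(3))   # => 3
    t = Text.new; t.set('word')
    to_ruby(t)                    # => "word"
    to_java(7)                    # => an IntWritable wrapping 7
    to_java('word').to_string     # => "word"
    to_java(3.14)                 # raises "no match class: Float"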
data/spec/jruby-on-hadoop_spec.rb CHANGED
@@ -6,21 +6,52 @@ describe JRubyOnHadoop do
     jar_path = File.join(File.expand_path(jar_dir), 'hadoop-ruby.jar')
     JRubyOnHadoop.jar_path.should == jar_path
   end
+
+  it 'should return lib path' do
+    lib_dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
+    JRubyOnHadoop.lib_path.should == lib_dir
+  end
+
+  it 'should return wrapper ruby file' do
+    dir = File.join(File.dirname(__FILE__), '..', 'lib')
+    path = File.join(File.expand_path(dir), 'ruby_wrapper.rb')
+    JRubyOnHadoop.wrapper_ruby_file.should == path
+  end
 end
 
 describe JRubyOnHadoop::Client do
+  before do
+    @client = JRubyOnHadoop::Client.new
+  end
+
   it 'gather necessary jar paths' do
     version_pattern = '[\d\.]*'
-    client = JRubyOnHadoop::Client.new
-    client.main_jar_path.should include 'hadoop-ruby.jar'
+    @client.main_jar_path.should include 'hadoop-ruby.jar'
 
-    client.jruby_jar_paths.should match /jruby\-core\-#{version_pattern}\.jar/
-    client.jruby_jar_paths.should match /jruby\-stdlib\-#{version_pattern}\.jar/
+    @client.opt_libjars.should match /jruby\-core\-#{version_pattern}\.jar/
+    @client.opt_libjars.should match /jruby\-stdlib\-#{version_pattern}\.jar/
+  end
+
+  it 'gather necessary ruby files' do
+    @client.opt_files.split(",").should include "mapred.rb"
+    @client.opt_files.should match /ruby_wrapper\.rb/
   end
 
   it 'construct command for running hadoop' do
     path_pattern = '[\w/\-\.,]*'
-    client = JRubyOnHadoop::Client.new
-    client.cmd.should match /hadoop jar #{path_pattern}hadoop-ruby.jar org.apache.hadoop.ruby.JRubyJobRunner -libjars #{path_pattern}.jar -files mapred.rb/
+    @client.cmd.should match /hadoop jar #{path_pattern}hadoop-ruby.jar org.apache.hadoop.ruby.JRubyJobRunner -libjars #{path_pattern}.jar -files mapred.rb/
+  end
+
+  it 'can get mapred args' do
+    client = JRubyOnHadoop::Client.new(["examples/mapred.rb", "inputs", "outputs"])
+    client.mapred_args.should == "--script mapred.rb inputs outputs"
+  end
+
+  it 'can parse args' do
+    client = JRubyOnHadoop::Client.new(["examples/mapred.rb", "in", "out"])
+    client.script.should == 'mapred.rb'
+    client.inputs.should == 'in'
+    client.outputs.should == 'out'
+    client.files.should include 'examples/mapred.rb'
   end
 end
data/spec/ruby_wrapper_spec.rb ADDED
@@ -0,0 +1,30 @@
+ require 'ruby_wrapper'
+
+ describe 'wrapper' do
+   before do
+     examples_dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'examples'))
+     $: << examples_dir
+
+     @script = 'mapred.rb'
+     @output, @reporter = mock('output'), mock('reporter')
+
+     @key, @value = Text.new, Text.new
+     @key.set('key')
+     @value.set('value')
+   end
+
+   it 'can wrap setup' do
+     wrap_setup(mock('conf'), @script, nil)
+   end
+
+   it 'can wrap mapper' do
+     @output.should_receive(:collect).once
+     wrap_map(@key, @value, @output, @reporter, @script, nil)
+   end
+
+   it 'can wrap reducer' do
+     @output.should_receive(:collect).once
+     values = [1, 2, 3].map {|v| IntWritable.new(v)}.to_java
+     wrap_reduce(@key, values, @output, @reporter, @script, nil)
+   end
+ end
data/test/java/org/apache/hadoop/ruby/JRubyJobRunnerTest.java ADDED
@@ -0,0 +1,18 @@
+ package org.apache.hadoop.ruby;
+
+ import org.apache.hadoop.conf.Configuration;
+ import org.junit.Test;
+
+ public class JRubyJobRunnerTest {
+
+   @Test
+   public void testRun() throws Exception {
+     JRubyJobRunner runner = new JRubyJobRunner();
+     Configuration conf = new Configuration();
+     runner.setConf(conf);
+     String[] args = { "--script", "mapred.rb", "inputs", "outputs" };
+     try {
+       runner.run(args);
+     } catch (Throwable t) { /* ignore */ }
+   }
+ }
data/test/java/org/apache/hadoop/ruby/mapred/JRubyMapperTest.java ADDED
@@ -0,0 +1,31 @@
+ package org.apache.hadoop.ruby.mapred;
+
+ import java.io.IOException;
+
+ import org.apache.hadoop.io.LongWritable;
+ import org.apache.hadoop.io.Text;
+ import org.apache.hadoop.mapred.JobConf;
+ import org.junit.Test;
+
+ public class JRubyMapperTest {
+
+   @Test
+   public void testMap() throws IOException {
+     LongWritable key = new LongWritable(0L);
+     Text value = new Text();
+     value.set("value");
+
+     JRubyMapper mapper = new JRubyMapper();
+     JobConf conf = new JobConf();
+     conf.set("mapred.ruby.script", "mapred.rb");
+     mapper.configure(conf);
+
+     try {
+       mapper.map(key, value, null, null);
+     } catch (Throwable t) {
+       // ignore
+       // TODO mock check
+     }
+   }
+
+ }
data/test/java/org/apache/hadoop/ruby/mapred/JRubyReducerTest.java ADDED
@@ -0,0 +1,36 @@
+ package org.apache.hadoop.ruby.mapred;
+
+ import java.io.IOException;
+ import java.util.ArrayList;
+ import java.util.List;
+
+ import org.apache.hadoop.io.IntWritable;
+ import org.apache.hadoop.io.Text;
+ import org.apache.hadoop.mapred.JobConf;
+ import org.junit.Test;
+
+ public class JRubyReducerTest {
+
+   @Test
+   public void testReduce() throws IOException {
+     Text key = new Text();
+     key.set("key");
+     List<IntWritable> values = new ArrayList<IntWritable>();
+     values.add(new IntWritable(1));
+     values.add(new IntWritable(2));
+     values.add(new IntWritable(3));
+
+     JRubyReducer reducer = new JRubyReducer();
+     JobConf conf = new JobConf();
+     conf.set("mapred.ruby.script", "mapred.rb");
+     reducer.configure(conf);
+
+     try {
+       reducer.reduce(key, values.iterator(), null, null);
+     } catch (Throwable t) {
+       // ignore
+       // TODO mock check
+     }
+   }
+
+ }
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: jruby-on-hadoop
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.0.3
 platform: ruby
 authors:
 - Koichi Fujikawa
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2009-12-28 00:00:00 +09:00
+date: 2010-01-03 00:00:00 +09:00
 default_executable: joh
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -29,17 +29,24 @@ executables:
 extensions: []
 
 extra_rdoc_files:
-- README
+- README.rdoc
 files:
-- README
+- README.rdoc
 - Rakefile
 - VERSION
+- bin/joh
 - build.xml
+- examples/wordcount.rb
 - jruby-on-hadoop.gemspec
 - lib/hadoop-ruby.jar
 - lib/jruby-on-hadoop.rb
 - lib/jruby-on-hadoop/client.rb
+- lib/ruby_wrapper.rb
 - spec/jruby-on-hadoop_spec.rb
+- spec/ruby_wrapper_spec.rb
+- test/java/org/apache/hadoop/ruby/JRubyJobRunnerTest.java
+- test/java/org/apache/hadoop/ruby/mapred/JRubyMapperTest.java
+- test/java/org/apache/hadoop/ruby/mapred/JRubyReducerTest.java
 has_rdoc: true
 homepage: http://github.com/fujibee/jruby-on-hadoop
 licenses: []
@@ -70,3 +77,5 @@ specification_version: 3
 summary: JRuby on Hadoop
 test_files:
 - spec/jruby-on-hadoop_spec.rb
+- spec/ruby_wrapper_spec.rb
+- examples/wordcount.rb
data/README DELETED
@@ -1,30 +0,0 @@
1
- = hadoop-ruby
2
-
3
- == Description
4
- HadoopのMapper/ReducerをRubyで記述することができます。
5
- hadoop-rubydslのためのJRubyラッパーです。
6
-
7
- 例)
8
- init.rb
9
-
10
- def map(script, key, value, output, reporter)
11
- # map処理
12
- end
13
-
14
- def reduce(script, key, values, output, reporter)
15
- # reduce処理
16
- end
17
-
18
- == Build
19
-
20
- ant
21
-
22
- を実行します。
23
- 環境変数HADOOP_HOMEを設定する必要があります。
24
- 想定しているHadoopのバージョンは0.19.2です。
25
-
26
- == Author
27
- Koichi Fujikawa <fujibee@gmail.com>
28
-
29
- == Copyright
30
- License: Apache License