RubyGems - jruby-on-hadoop - Versions diffs - 0.0.2 → 0.0.3 - Mend

jruby-on-hadoop 0.0.2 → 0.0.3

Files changed (16) hide show

data/README.rdoc +47 -0
data/VERSION +1 -1
data/build.xml +21 -9
data/examples/wordcount.rb +18 -0
data/jruby-on-hadoop.gemspec +15 -6
data/lib/hadoop-ruby.jar +0 -0
data/lib/jruby-on-hadoop.rb +10 -1
data/lib/jruby-on-hadoop/client.rb +39 -7
data/lib/ruby_wrapper.rb +59 -0
data/spec/jruby-on-hadoop_spec.rb +37 -6
data/spec/ruby_wrapper_spec.rb +30 -0
data/test/java/org/apache/hadoop/ruby/JRubyJobRunnerTest.java +18 -0
data/test/java/org/apache/hadoop/ruby/mapred/JRubyMapperTest.java +31 -0
data/test/java/org/apache/hadoop/ruby/mapred/JRubyReducerTest.java +36 -0
metadata +13 -4
data/README +0 -30

data/README.rdoc ADDED Viewed

@@ -0,0 +1,47 @@
+= JRuby on Hadoop
+JRuby on Hadoop is a thin JRuby wrapper for Hadoop Mappers and Reducers.
+== Install
+Required gems are all on GemCutter.
+1. Upgrade RubyGems to version 1.3.5
+2. Install gems
+ $ gem install jruby-on-hadoop
+== Description
+1. Start a Hadoop cluster on your machines and set the HADOOP_HOME environment variable.
+2. Put your input files into HDFS, e.g. test/inputs/file1
+3. Now you can run 'joh' as follows:
+ $ joh examples/wordcount.rb test/inputs test/outputs
+The Hadoop job results are written to HDFS under test/outputs/part-*
+Example script (see also examples/wordcount.rb):
+ def setup(conf)
+   # setup jobconf
+ end
+ def map(key, value, output, reporter)
+   # mapper process
+ end
+ def reduce(key, values, output, reporter)
+   # reducer process
+ end
+== Build
+You can build hadoop-ruby.jar with Ant:
+ ant
+The HADOOP_HOME environment variable must be set for your system.
+The build assumes Hadoop version 0.19.2.
+== Author
+Koichi Fujikawa <fujibee@gmail.com>
+== Copyright
+License: Apache License

data/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 0.0.2
1	+ 0.0.3

data/build.xml CHANGED Viewed

@@ -9,10 +9,10 @@
      ====================================================================== -->
 <project name="build JRuby map reduce" default="jar">
-    <property environment="env" />
-    <property name="hadoop.home" value="${env.HADOOP_HOME}" />
-	<property name="version" value="0.0.1" />
+	<property environment="env" />
+	<property name="hadoop.home" value="${env.HADOOP_HOME}" />
+	<property name="version" value="0.0.2" />
 	<property name="src.dir" value="${basedir}/src/java" />
 	<property name="build.dir" value="${basedir}/build" />
 	<property name="dist.dir" value="${basedir}/dist" />
@@ -20,7 +20,7 @@
 	<property name="hadoop.version" value="0.19.2" />
 	<property name="hadoop.jar" value="${hadoop.home}/hadoop-${hadoop.version}-core.jar" />
-	<target name="jar" depends="compile">
+	<target name="jar" depends="clean,compile">
 		<mkdir dir="${dist.dir}" />
 		<jar jarfile="${dist.dir}/hadoop-ruby.jar" basedir="${build.dir}">
 			<manifest>
@@ -33,10 +33,22 @@
 		</jar>
 	</target>
-    <target name="compile">
-        <echo message="${hadoop.jar}" />
+	<target name="compile">
+		<echo message="${hadoop.jar}" />
 		<mkdir dir="${build.dir}" />
-        <javac srcdir="${src.dir}" destdir="${build.dir}" classpath="${hadoop.jar}" />
-    </target>
+		<javac srcdir="${src.dir}" destdir="${build.dir}">
+			<classpath>
+				<fileset dir="${hadoop.home}">
+					<include name="*.jar" />
+					<include name="lib/*.jar" />
+				</fileset>
+			</classpath>
+		</javac>
+	</target>
+	<target name="clean">
+		<delete dir="${build.dir}" />
+		<delete dir="${dist.dir}" />
+	</target>
 </project>

data/examples/wordcount.rb ADDED Viewed

@@ -0,0 +1,18 @@
+# wordcount example
+# not necessary
+def setup(conf)
+  ['test/inputs', 'test/outputs']
+end
+def map(key, value, output, reporter)
+  value.split.each do |word|
+    output.collect(word, 1)
+  end
+end
+def reduce(key, values, output, reporter)
+  sum = 0
+  values.each {|v| sum += v }
+  output.collect(key, sum)
+end

data/jruby-on-hadoop.gemspec CHANGED Viewed

@@ -5,28 +5,35 @@
 Gem::Specification.new do |s|
   s.name = %q{jruby-on-hadoop}
-  s.version = "0.0.2"
+  s.version = "0.0.3"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Koichi Fujikawa"]
-  s.date = %q{2009-12-28}
+  s.date = %q{2010-01-03}
   s.default_executable = %q{joh}
   s.description = %q{JRuby on Hadoop}
   s.email = %q{fujibee@gmail.com}
   s.executables = ["joh"]
   s.extra_rdoc_files = [
-    "README"
+    "README.rdoc"
   ]
   s.files = [
-    "README",
+    "README.rdoc",
      "Rakefile",
      "VERSION",
+     "bin/joh",
      "build.xml",
+     "examples/wordcount.rb",
      "jruby-on-hadoop.gemspec",
      "lib/hadoop-ruby.jar",
      "lib/jruby-on-hadoop.rb",
      "lib/jruby-on-hadoop/client.rb",
-     "spec/jruby-on-hadoop_spec.rb"
+     "lib/ruby_wrapper.rb",
+     "spec/jruby-on-hadoop_spec.rb",
+     "spec/ruby_wrapper_spec.rb",
+     "test/java/org/apache/hadoop/ruby/JRubyJobRunnerTest.java",
+     "test/java/org/apache/hadoop/ruby/mapred/JRubyMapperTest.java",
+     "test/java/org/apache/hadoop/ruby/mapred/JRubyReducerTest.java"
   ]
   s.homepage = %q{http://github.com/fujibee/jruby-on-hadoop}
   s.rdoc_options = ["--charset=UTF-8"]
@@ -34,7 +41,9 @@ Gem::Specification.new do |s|
   s.rubygems_version = %q{1.3.5}
   s.summary = %q{JRuby on Hadoop}
   s.test_files = [
-    "spec/jruby-on-hadoop_spec.rb"
+    "spec/jruby-on-hadoop_spec.rb",
+     "spec/ruby_wrapper_spec.rb",
+     "examples/wordcount.rb"
   ]
   if s.respond_to? :specification_version then

data/lib/hadoop-ruby.jar CHANGED Viewed

Binary file

data/lib/jruby-on-hadoop.rb CHANGED Viewed

@@ -2,7 +2,16 @@ require 'jruby-jars'
 require 'jruby-on-hadoop/client'
 module JRubyOnHadoop
+  def self.lib_path
+    File.expand_path(File.dirname(__FILE__))
+  end
   def self.jar_path
-    File.join(File.expand_path(File.dirname(__FILE__)), "hadoop-ruby.jar")
+    File.join(lib_path, "hadoop-ruby.jar")
+  end
+  def self.wrapper_ruby_file
+    File.join(lib_path, "ruby_wrapper.rb")
   end
 end

data/lib/jruby-on-hadoop/client.rb CHANGED Viewed

@@ -2,9 +2,17 @@ module JRubyOnHadoop
   JAVA_MAIN_CLASS = 'org.apache.hadoop.ruby.JRubyJobRunner'
   class Client
-    def initialize(argv=[])
-      @init_script = argv[0] || 'mapred.rb'
-      @args = argv[1..argv.size-1].join(" ") if argv.size > 0
+    attr_reader :script, :inputs, :outputs, :files
+    def initialize(args=[])
+      @args = args
+      parse_args
+      # env check
+      hadoop_home = ENV['HADOOP_HOME']
+      raise 'HADOOP_HOME is not set' unless hadoop_home
+      @hadoop_cmd = "#{hadoop_home}/bin/hadoop"
+      ENV['HADOOP_CLASSPATH'] = "#{lib_path}:#{File.dirname(@script_path)}"
     end
     def run
@@ -12,16 +20,40 @@ module JRubyOnHadoop
     end
     def cmd
-      "hadoop jar #{main_jar_path} #{JAVA_MAIN_CLASS}" +
-      " -libjars #{jruby_jar_paths} -files #{@init_script} #{@args}"
+      "#{@hadoop_cmd} jar #{main_jar_path} #{JAVA_MAIN_CLASS}" +
+      " -libjars #{opt_libjars} -files #{opt_files} #{mapred_args}"
+    end
+    def parse_args
+      @script_path = @args.size > 0 ? @args[0] : 'mapred.rb'
+      @script = File.basename(@script_path)
+      @inputs = @args[1] if @args.size == 3
+      @outputs = @args[2] if @args.size == 3
+      @files = [@script_path, JRubyOnHadoop.wrapper_ruby_file]
+    end
+    def mapred_args
+      args = "--script #{@script} "
+      args += "#{@inputs} " if @inputs
+      args += "#{@outputs}" if @outputs
+      args
+    end
+    def opt_libjars
+      # jruby jars
+      [JRubyJars.core_jar_path, JRubyJars.stdlib_jar_path].join(',')
+    end
+    def opt_files
+      @files.join(',')
     end
     def main_jar_path
       JRubyOnHadoop.jar_path
     end
-    def jruby_jar_paths
-      [JRubyJars.core_jar_path, JRubyJars.stdlib_jar_path].join(',')
+    def lib_path
+      JRubyOnHadoop.lib_path
     end
   end
 end

data/lib/ruby_wrapper.rb ADDED Viewed

@@ -0,0 +1,59 @@
+require 'java'
+import 'org.apache.hadoop.io.IntWritable'
+import 'org.apache.hadoop.io.LongWritable'
+import 'org.apache.hadoop.io.Text'
+def wrap_setup(conf, script, dslfile)
+  require script
+  paths = dslfile ? setup(conf, dslfile) : setup(conf)
+  paths.to_java if paths
+end
+def wrap_map(key, value, output, reporter, script, dslfile)
+  require script
+  output_wrapper = OutputWrapper.new(output)
+  dslfile ?
+    map(to_ruby(key), to_ruby(value), output_wrapper, reporter, dslfile) :
+    map(to_ruby(key), to_ruby(value), output_wrapper, reporter)
+end
+def wrap_reduce(key, values, output, reporter, script, dslfile)
+  require script
+  output_wrapper = OutputWrapper.new(output)
+  dslfile ?
+    reduce(to_ruby(key), to_ruby(values), output_wrapper, reporter, dslfile) :
+    reduce(to_ruby(key), to_ruby(values), output_wrapper, reporter)
+end
+class OutputWrapper
+  def initialize(output)
+    @output = output
+  end
+  def collect(key, value)
+    @output.collect(to_java(key), to_java(value))
+  end
+end
+def to_ruby(value)
+  case value
+  when IntWritable, LongWritable then value.get
+  when Text then value.to_string
+  else
+    # for Java array
+    if value.respond_to? :map
+      value.map {|v| to_ruby(v)}
+    else value # as is
+    end
+  end
+end
+def to_java(value)
+  case value
+  when Integer then IntWritable.new(value)
+  when String then t = Text.new; t.set(value); t
+  when Array then value.to_java
+  else raise "no match class: #{value.class}"
+  end
+end

data/spec/jruby-on-hadoop_spec.rb CHANGED Viewed

@@ -6,21 +6,52 @@ describe JRubyOnHadoop do
     jar_path = File.join(File.expand_path(jar_dir), 'hadoop-ruby.jar')
     JRubyOnHadoop.jar_path.should == jar_path
   end
+  it 'should return lib path' do
+    lib_dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
+    JRubyOnHadoop.lib_path.should == lib_dir
+  end
+  it 'should return wrapper ruby file' do
+    dir = File.join(File.dirname(__FILE__), '..', 'lib')
+    path = File.join(File.expand_path(dir), 'ruby_wrapper.rb')
+    JRubyOnHadoop.wrapper_ruby_file.should == path
+  end
 end
 describe JRubyOnHadoop::Client do
+  before do
+    @client = JRubyOnHadoop::Client.new
+  end
   it 'gather necessary jar paths' do
     version_pattern = '[\d\.]*'
-    client = JRubyOnHadoop::Client.new
-    client.main_jar_path.should include 'hadoop-ruby.jar'
+    @client.main_jar_path.should include 'hadoop-ruby.jar'
-    client.jruby_jar_paths.should match /jruby\-core\-#{version_pattern}\.jar/
-    client.jruby_jar_paths.should match /jruby\-stdlib\-#{version_pattern}\.jar/
+    @client.opt_libjars.should match /jruby\-core\-#{version_pattern}\.jar/
+    @client.opt_libjars.should match /jruby\-stdlib\-#{version_pattern}\.jar/
+  end
+  it 'gather necessary ruby files' do
+    @client.opt_files.split(",").should include "mapred.rb"
+    @client.opt_files.should match /ruby_wrapper\.rb/
   end
   it 'construct command for running hadoop' do
     path_pattern = '[\w/\-\.,]*'
-    client = JRubyOnHadoop::Client.new
-    client.cmd.should match /hadoop jar #{path_pattern}hadoop-ruby.jar org.apache.hadoop.ruby.JRubyJobRunner -libjars #{path_pattern}.jar -files mapred.rb/
+    @client.cmd.should match /hadoop jar #{path_pattern}hadoop-ruby.jar org.apache.hadoop.ruby.JRubyJobRunner -libjars #{path_pattern}.jar -files mapred.rb/
+  end
+  it 'can get mapred args' do
+    client = JRubyOnHadoop::Client.new(["examples/mapred.rb", "inputs", "outputs"])
+    client.mapred_args.should == "--script mapred.rb inputs outputs"
+  end
+  it 'can parse args' do
+    client = JRubyOnHadoop::Client.new(["examples/mapred.rb", "in", "out"])
+    client.script.should == 'mapred.rb'
+    client.inputs.should == 'in'
+    client.outputs.should == 'out'
+    client.files.should include 'examples/mapred.rb'
   end
 end

data/spec/ruby_wrapper_spec.rb ADDED Viewed

@@ -0,0 +1,30 @@
+require 'ruby_wrapper'
+describe 'wrapper' do
+  before do
+    examples_dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'examples'))
+    $: << examples_dir
+    @script = 'mapred.rb'
+    @output, @repoter = mock('output'), mock('repoter')
+    @key, @value = Text.new, Text.new
+    @key.set('key')
+    @value.set('value')
+  end
+  it 'can wrap setup' do
+    wrap_setup(mock('conf'), @script, nil)
+  end
+  it 'can wrap mapper' do
+    @output.should_receive(:collect).once
+    wrap_map(@key, @value, @output, @reporter, @script, nil)
+  end
+  it 'can wrap reducer' do
+    @output.should_receive(:collect).once
+    values = [1, 2, 3].map {|v| IntWritable.new(v)}.to_java
+    wrap_reduce(@key, values, @output, @reporter, @script, nil)
+  end
+end

data/test/java/org/apache/hadoop/ruby/JRubyJobRunnerTest.java ADDED Viewed

@@ -0,0 +1,18 @@
+package org.apache.hadoop.ruby;
+import org.apache.hadoop.conf.Configuration;
+import org.junit.Test;
+public class JRubyJobRunnerTest {
+	@Test
+	public void testRun() throws Exception {
+		JRubyJobRunner runner = new JRubyJobRunner();
+		Configuration conf = new Configuration();
+		runner.setConf(conf);
+		String[] args = { "--script", "mapred.rb", "inputs", "outputs" };
+		try {
+			runner.run(args);
+		} catch (Throwable t) { /* ignore */ }
+	}
+}

data/test/java/org/apache/hadoop/ruby/mapred/JRubyMapperTest.java ADDED Viewed

@@ -0,0 +1,31 @@
+package org.apache.hadoop.ruby.mapred;
+import java.io.IOException;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.JobConf;
+import org.junit.Test;
+public class JRubyMapperTest {
+	@Test
+	public void testMap() throws IOException {
+		LongWritable key = new LongWritable(0L);
+		Text value = new Text();
+		value.set("value");
+		JRubyMapper mapper = new JRubyMapper();
+		JobConf conf = new JobConf();
+		conf.set("mapred.ruby.script", "mapred.rb");
+		mapper.configure(conf);
+		try {
+			mapper.map(key, value, null, null);
+		} catch (Throwable t) {
+			// ignore
+			// TODO mock check
+		}
+	}
+}

data/test/java/org/apache/hadoop/ruby/mapred/JRubyReducerTest.java ADDED Viewed

@@ -0,0 +1,36 @@
+package org.apache.hadoop.ruby.mapred;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.JobConf;
+import org.junit.Test;
+public class JRubyReducerTest {
+	@Test
+	public void testReduce() throws IOException {
+		Text key = new Text();
+		key.set("key");
+		List<IntWritable> values = new ArrayList<IntWritable>();
+		values.add(new IntWritable(1));
+		values.add(new IntWritable(2));
+		values.add(new IntWritable(3));
+		JRubyReducer reducer = new JRubyReducer();
+		JobConf conf = new JobConf();
+		conf.set("mapred.ruby.script", "mapred.rb");
+		reducer.configure(conf);
+		try {
+			reducer.reduce(key, values.iterator(), null, null);
+		} catch (Throwable t) {
+			// ignore
+			// TODO mock check
+		}
+	}
+}

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: jruby-on-hadoop
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.0.3
 platform: ruby
 authors:
 - Koichi Fujikawa
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2009-12-28 00:00:00 +09:00
+date: 2010-01-03 00:00:00 +09:00
 default_executable: joh
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -29,17 +29,24 @@ executables:
 extensions: []
 extra_rdoc_files:
-- README
+- README.rdoc
 files:
-- README
+- README.rdoc
 - Rakefile
 - VERSION
+- bin/joh
 - build.xml
+- examples/wordcount.rb
 - jruby-on-hadoop.gemspec
 - lib/hadoop-ruby.jar
 - lib/jruby-on-hadoop.rb
 - lib/jruby-on-hadoop/client.rb
+- lib/ruby_wrapper.rb
 - spec/jruby-on-hadoop_spec.rb
+- spec/ruby_wrapper_spec.rb
+- test/java/org/apache/hadoop/ruby/JRubyJobRunnerTest.java
+- test/java/org/apache/hadoop/ruby/mapred/JRubyMapperTest.java
+- test/java/org/apache/hadoop/ruby/mapred/JRubyReducerTest.java
 has_rdoc: true
 homepage: http://github.com/fujibee/jruby-on-hadoop
 licenses: []
@@ -70,3 +77,5 @@ specification_version: 3
 summary: JRuby on Hadoop
 test_files:
 - spec/jruby-on-hadoop_spec.rb
+- spec/ruby_wrapper_spec.rb
+- examples/wordcount.rb

data/README DELETED Viewed

@@ -1,30 +0,0 @@
-= hadoop-ruby
-== Description
-HadoopのMapper/ReducerをRubyで記述することができます。
-hadoop-rubydslのためのJRubyラッパーです。
-例）
-init.rb
-def map(script, key, value, output, reporter)
-  # map処理
-end
-def reduce(script, key, values, output, reporter)
-  # reduce処理
-end
-== Build
-ant
-を実行します。
-環境変数HADOOP_HOMEを設定する必要があります。
-想定しているHadoopのバージョンは0.19.2です。
-== Author
-Koichi Fujikawa <fujibee@gmail.com>
-== Copyright
-License: Apache License