jruby-on-hadoop 0.0.2 → 0.0.3

data/README.rdoc ADDED
@@ -0,0 +1,47 @@
+ = JRuby on Hadoop
+
+ JRuby on Hadoop is a thin JRuby wrapper for writing Hadoop Mappers and Reducers in Ruby.
+
+ == Install
+
+ All required gems are on GemCutter.
+
+ 1. Upgrade your RubyGems to 1.3.5
+ 2. Install the gem:
+    $ gem install jruby-on-hadoop
+
+ == Description
+
+ 1. Run a Hadoop cluster on your machines and set the HADOOP_HOME env variable.
+ 2. Put input files into your HDFS, e.g. test/inputs/file1.
+ 3. Now you can run 'joh' like below:
+    $ joh examples/wordcount.rb test/inputs test/outputs
+    Job results appear in your HDFS under test/outputs/part-*.
+
+ Script example (see also examples/wordcount.rb):
+
+   def setup(conf)
+     # set up the jobconf (optionally return [input_dir, output_dir])
+   end
+
+   def map(key, value, output, reporter)
+     # mapper process
+   end
+
+   def reduce(key, values, output, reporter)
+     # reducer process
+   end
+
+ == Build
+
+ You can build hadoop-ruby.jar with "ant":
+   ant
+
+ The HADOOP_HOME environment variable must be set for your system.
+ The assumed Hadoop version is 0.19.2.
+
+ == Author
+ Koichi Fujikawa <fujibee@gmail.com>
+
+ == Copyright
+ License: Apache License
data/VERSION CHANGED
@@ -1 +1 @@
- 0.0.2
+ 0.0.3
data/build.xml CHANGED
@@ -9,10 +9,10 @@
  ====================================================================== -->
  <project name="build JRuby map reduce" default="jar">
 
- <property environment="env" />
- <property name="hadoop.home" value="${env.HADOOP_HOME}" />
-
- <property name="version" value="0.0.1" />
+ <property environment="env" />
+ <property name="hadoop.home" value="${env.HADOOP_HOME}" />
+
+ <property name="version" value="0.0.2" />
 
  <property name="src.dir" value="${basedir}/src/java" />
  <property name="build.dir" value="${basedir}/build" />
  <property name="dist.dir" value="${basedir}/dist" />
@@ -20,7 +20,7 @@
  <property name="hadoop.version" value="0.19.2" />
  <property name="hadoop.jar" value="${hadoop.home}/hadoop-${hadoop.version}-core.jar" />
 
- <target name="jar" depends="compile">
+ <target name="jar" depends="clean,compile">
  <mkdir dir="${dist.dir}" />
  <jar jarfile="${dist.dir}/hadoop-ruby.jar" basedir="${build.dir}">
  <manifest>
@@ -33,10 +33,22 @@
  </jar>
  </target>
 
- <target name="compile">
- <echo message="${hadoop.jar}" />
+ <target name="compile">
+ <echo message="${hadoop.jar}" />
  <mkdir dir="${build.dir}" />
- <javac srcdir="${src.dir}" destdir="${build.dir}" classpath="${hadoop.jar}" />
- </target>
+ <javac srcdir="${src.dir}" destdir="${build.dir}">
+   <classpath>
+     <fileset dir="${hadoop.home}">
+       <include name="*.jar" />
+       <include name="lib/*.jar" />
+     </fileset>
+   </classpath>
+ </javac>
+ </target>
+
+ <target name="clean">
+   <delete dir="${build.dir}" />
+   <delete dir="${dist.dir}" />
+ </target>
 
  </project>
data/examples/wordcount.rb ADDED
@@ -0,0 +1,18 @@
+ # wordcount example
+
+ # not necessary
+ def setup(conf)
+   ['test/inputs', 'test/outputs']
+ end
+
+ def map(key, value, output, reporter)
+   value.split.each do |word|
+     output.collect(word, 1)
+   end
+ end
+
+ def reduce(key, values, output, reporter)
+   sum = 0
+   values.each {|v| sum += v }
+   output.collect(key, sum)
+ end
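For illustration only (not part of the gem): the map/reduce functions above are plain Ruby, so you can sanity-check them locally without a cluster. The Collector class below is a hypothetical stand-in for the OutputWrapper that ruby_wrapper.rb passes in at runtime.

    # hypothetical local driver, run from the repo root
    require './examples/wordcount'

    # minimal stand-in for the wrapper's OutputWrapper
    class Collector
      attr_reader :pairs
      def initialize; @pairs = []; end
      def collect(key, value); @pairs << [key, value]; end
    end

    mapped = Collector.new
    map(0, 'foo bar foo', mapped, nil)

    final = Collector.new
    mapped.pairs.group_by {|k, _| k }.each do |word, pairs|
      reduce(word, pairs.map {|_, v| v }, final, nil)
    end
    final.pairs.sort.each {|k, v| puts "#{k}\t#{v}" }
    # => bar 1
    #    foo 2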
data/jruby-on-hadoop.gemspec CHANGED
@@ -5,28 +5,35 @@
 
  Gem::Specification.new do |s|
    s.name = %q{jruby-on-hadoop}
-   s.version = "0.0.2"
+   s.version = "0.0.3"
 
    s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
    s.authors = ["Koichi Fujikawa"]
-   s.date = %q{2009-12-28}
+   s.date = %q{2010-01-03}
    s.default_executable = %q{joh}
    s.description = %q{JRuby on Hadoop}
    s.email = %q{fujibee@gmail.com}
    s.executables = ["joh"]
    s.extra_rdoc_files = [
-     "README"
+     "README.rdoc"
    ]
    s.files = [
-     "README",
+     "README.rdoc",
      "Rakefile",
      "VERSION",
+     "bin/joh",
      "build.xml",
+     "examples/wordcount.rb",
      "jruby-on-hadoop.gemspec",
      "lib/hadoop-ruby.jar",
      "lib/jruby-on-hadoop.rb",
      "lib/jruby-on-hadoop/client.rb",
-     "spec/jruby-on-hadoop_spec.rb"
+     "lib/ruby_wrapper.rb",
+     "spec/jruby-on-hadoop_spec.rb",
+     "spec/ruby_wrapper_spec.rb",
+     "test/java/org/apache/hadoop/ruby/JRubyJobRunnerTest.java",
+     "test/java/org/apache/hadoop/ruby/mapred/JRubyMapperTest.java",
+     "test/java/org/apache/hadoop/ruby/mapred/JRubyReducerTest.java"
    ]
    s.homepage = %q{http://github.com/fujibee/jruby-on-hadoop}
    s.rdoc_options = ["--charset=UTF-8"]
@@ -34,7 +41,9 @@ Gem::Specification.new do |s|
    s.rubygems_version = %q{1.3.5}
    s.summary = %q{JRuby on Hadoop}
    s.test_files = [
-     "spec/jruby-on-hadoop_spec.rb"
+     "spec/jruby-on-hadoop_spec.rb",
+     "spec/ruby_wrapper_spec.rb",
+     "examples/wordcount.rb"
    ]
 
  if s.respond_to? :specification_version then
data/lib/hadoop-ruby.jar CHANGED
Binary file
data/lib/jruby-on-hadoop.rb CHANGED
@@ -2,7 +2,16 @@ require 'jruby-jars'
  require 'jruby-on-hadoop/client'
 
  module JRubyOnHadoop
+
+   def self.lib_path
+     File.expand_path(File.dirname(__FILE__))
+   end
+
    def self.jar_path
-     File.join(File.expand_path(File.dirname(__FILE__)), "hadoop-ruby.jar")
+     File.join(lib_path, "hadoop-ruby.jar")
+   end
+
+   def self.wrapper_ruby_file
+     File.join(lib_path, "ruby_wrapper.rb")
    end
  end
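For reference, a sketch of what the new helpers return (install-dependent prefixes elided as "..."):

    require 'jruby-on-hadoop'

    JRubyOnHadoop.lib_path           # => ".../jruby-on-hadoop/lib"
    JRubyOnHadoop.jar_path           # => ".../lib/hadoop-ruby.jar"
    JRubyOnHadoop.wrapper_ruby_file  # => ".../lib/ruby_wrapper.rb"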
data/lib/jruby-on-hadoop/client.rb CHANGED
@@ -2,9 +2,17 @@ module JRubyOnHadoop
   JAVA_MAIN_CLASS = 'org.apache.hadoop.ruby.JRubyJobRunner'
 
   class Client
-    def initialize(argv=[])
-      @init_script = argv[0] || 'mapred.rb'
-      @args = argv[1..argv.size-1].join(" ") if argv.size > 0
+    attr_reader :script, :inputs, :outputs, :files
+
+    def initialize(args=[])
+      @args = args
+      parse_args
+
+      # env check
+      hadoop_home = ENV['HADOOP_HOME']
+      raise 'HADOOP_HOME is not set' unless hadoop_home
+      @hadoop_cmd = "#{hadoop_home}/bin/hadoop"
+      ENV['HADOOP_CLASSPATH'] = "#{lib_path}:#{File.dirname(@script_path)}"
     end
 
     def run
@@ -12,16 +20,40 @@ module JRubyOnHadoop
     end
 
     def cmd
-      "hadoop jar #{main_jar_path} #{JAVA_MAIN_CLASS}" +
-        " -libjars #{jruby_jar_paths} -files #{@init_script} #{@args}"
+      "#{@hadoop_cmd} jar #{main_jar_path} #{JAVA_MAIN_CLASS}" +
+        " -libjars #{opt_libjars} -files #{opt_files} #{mapred_args}"
+    end
+
+    def parse_args
+      @script_path = @args.size > 0 ? @args[0] : 'mapred.rb'
+      @script = File.basename(@script_path)
+      @inputs = @args[1] if @args.size == 3
+      @outputs = @args[2] if @args.size == 3
+      @files = [@script_path, JRubyOnHadoop.wrapper_ruby_file]
+    end
+
+    def mapred_args
+      args = "--script #{@script} "
+      args += "#{@inputs} " if @inputs
+      args += "#{@outputs}" if @outputs
+      args
+    end
+
+    def opt_libjars
+      # jruby jars
+      [JRubyJars.core_jar_path, JRubyJars.stdlib_jar_path].join(',')
+    end
+
+    def opt_files
+      @files.join(',')
     end
 
     def main_jar_path
       JRubyOnHadoop.jar_path
     end
 
-    def jruby_jar_paths
-      [JRubyJars.core_jar_path, JRubyJars.stdlib_jar_path].join(',')
+    def lib_path
+      JRubyOnHadoop.lib_path
     end
   end
 end
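A sketch of the command the reworked Client now assembles, assuming HADOOP_HOME is set (initialize raises otherwise); "..." marks install-dependent paths and versions:

    require 'jruby-on-hadoop'

    client = JRubyOnHadoop::Client.new(%w[examples/wordcount.rb test/inputs test/outputs])
    puts client.cmd
    # $HADOOP_HOME/bin/hadoop jar .../lib/hadoop-ruby.jar org.apache.hadoop.ruby.JRubyJobRunner
    #   -libjars .../jruby-core-....jar,.../jruby-stdlib-....jar
    #   -files examples/wordcount.rb,.../lib/ruby_wrapper.rb
    #   --script wordcount.rb test/inputs test/outputs
    client.run  # presumably shells out to the command above

Note that only the script's basename travels in --script, while the full path rides along in -files, and ruby_wrapper.rb is always shipped alongside it.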
data/lib/ruby_wrapper.rb ADDED
@@ -0,0 +1,59 @@
+ require 'java'
+
+ import 'org.apache.hadoop.io.IntWritable'
+ import 'org.apache.hadoop.io.LongWritable'
+ import 'org.apache.hadoop.io.Text'
+
+ def wrap_setup(conf, script, dslfile)
+   require script
+   paths = dslfile ? setup(conf, dslfile) : setup(conf)
+   paths.to_java if paths
+ end
+
+ def wrap_map(key, value, output, reporter, script, dslfile)
+   require script
+   output_wrapper = OutputWrapper.new(output)
+   dslfile ?
+     map(to_ruby(key), to_ruby(value), output_wrapper, reporter, dslfile) :
+     map(to_ruby(key), to_ruby(value), output_wrapper, reporter)
+ end
+
+ def wrap_reduce(key, values, output, reporter, script, dslfile)
+   require script
+   output_wrapper = OutputWrapper.new(output)
+   dslfile ?
+     reduce(to_ruby(key), to_ruby(values), output_wrapper, reporter, dslfile) :
+     reduce(to_ruby(key), to_ruby(values), output_wrapper, reporter)
+ end
+
+ class OutputWrapper
+   def initialize(output)
+     @output = output
+   end
+
+   def collect(key, value)
+     @output.collect(to_java(key), to_java(value))
+   end
+ end
+
+ def to_ruby(value)
+   case value
+   when IntWritable, LongWritable then value.get
+   when Text then value.to_string
+   else
+     # for Java array
+     if value.respond_to? :map
+       value.map {|v| to_ruby(v)}
+     else value # as is
+     end
+   end
+ end
+
+ def to_java(value)
+   case value
+   when Integer then IntWritable.new(value)
+   when String then t = Text.new; t.set(value); t
+   when Array then value.to_java
+   else raise "no match class: #{value.class}"
+   end
+ end
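A quick JRuby-only sketch of the boundary conversions above, assuming the Hadoop core jar is on the classpath so the Writable imports resolve:

    require 'ruby_wrapper'

    to_ruby(IntWritable.new(3))   # => 3
    t = Text.new; t.set('word')
    to_ruby(t)                    # => "word"
    to_java(7)                    # => an IntWritable wrapping 7
    to_java('word').to_string     # => "word"
    to_java(3.14)                 # raises "no match class: Float"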
data/spec/jruby-on-hadoop_spec.rb CHANGED
@@ -6,21 +6,52 @@ describe JRubyOnHadoop do
     jar_path = File.join(File.expand_path(jar_dir), 'hadoop-ruby.jar')
     JRubyOnHadoop.jar_path.should == jar_path
   end
+
+  it 'should return lib path' do
+    lib_dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
+    JRubyOnHadoop.lib_path.should == lib_dir
+  end
+
+  it 'should return wrapper ruby file' do
+    dir = File.join(File.dirname(__FILE__), '..', 'lib')
+    path = File.join(File.expand_path(dir), 'ruby_wrapper.rb')
+    JRubyOnHadoop.wrapper_ruby_file.should == path
+  end
 end
 
 describe JRubyOnHadoop::Client do
+  before do
+    @client = JRubyOnHadoop::Client.new
+  end
+
   it 'gather necessary jar paths' do
     version_pattern = '[\d\.]*'
-    client = JRubyOnHadoop::Client.new
-    client.main_jar_path.should include 'hadoop-ruby.jar'
+    @client.main_jar_path.should include 'hadoop-ruby.jar'
 
-    client.jruby_jar_paths.should match /jruby\-core\-#{version_pattern}\.jar/
-    client.jruby_jar_paths.should match /jruby\-stdlib\-#{version_pattern}\.jar/
+    @client.opt_libjars.should match /jruby\-core\-#{version_pattern}\.jar/
+    @client.opt_libjars.should match /jruby\-stdlib\-#{version_pattern}\.jar/
+  end
+
+  it 'gather necessary ruby files' do
+    @client.opt_files.split(",").should include "mapred.rb"
+    @client.opt_files.should match /ruby_wrapper\.rb/
   end
 
   it 'construct command for running hadoop' do
     path_pattern = '[\w/\-\.,]*'
-    client = JRubyOnHadoop::Client.new
-    client.cmd.should match /hadoop jar #{path_pattern}hadoop-ruby.jar org.apache.hadoop.ruby.JRubyJobRunner -libjars #{path_pattern}.jar -files mapred.rb/
+    @client.cmd.should match /hadoop jar #{path_pattern}hadoop-ruby.jar org.apache.hadoop.ruby.JRubyJobRunner -libjars #{path_pattern}.jar -files mapred.rb/
+  end
+
+  it 'can get mapred args' do
+    client = JRubyOnHadoop::Client.new(["examples/mapred.rb", "inputs", "outputs"])
+    client.mapred_args.should == "--script mapred.rb inputs outputs"
+  end
+
+  it 'can parse args' do
+    client = JRubyOnHadoop::Client.new(["examples/mapred.rb", "in", "out"])
+    client.script.should == 'mapred.rb'
+    client.inputs.should == 'in'
+    client.outputs.should == 'out'
+    client.files.should include 'examples/mapred.rb'
   end
 end
data/spec/ruby_wrapper_spec.rb ADDED
@@ -0,0 +1,30 @@
+ require 'ruby_wrapper'
+
+ describe 'wrapper' do
+   before do
+     examples_dir = File.expand_path(File.join(File.dirname(__FILE__), '..', 'examples'))
+     $: << examples_dir
+
+     @script = 'mapred.rb'
+     @output, @reporter = mock('output'), mock('reporter')
+
+     @key, @value = Text.new, Text.new
+     @key.set('key')
+     @value.set('value')
+   end
+
+   it 'can wrap setup' do
+     wrap_setup(mock('conf'), @script, nil)
+   end
+
+   it 'can wrap mapper' do
+     @output.should_receive(:collect).once
+     wrap_map(@key, @value, @output, @reporter, @script, nil)
+   end
+
+   it 'can wrap reducer' do
+     @output.should_receive(:collect).once
+     values = [1, 2, 3].map {|v| IntWritable.new(v)}.to_java
+     wrap_reduce(@key, values, @output, @reporter, @script, nil)
+   end
+ end
data/test/java/org/apache/hadoop/ruby/JRubyJobRunnerTest.java ADDED
@@ -0,0 +1,18 @@
+ package org.apache.hadoop.ruby;
+
+ import org.apache.hadoop.conf.Configuration;
+ import org.junit.Test;
+
+ public class JRubyJobRunnerTest {
+
+   @Test
+   public void testRun() throws Exception {
+     JRubyJobRunner runner = new JRubyJobRunner();
+     Configuration conf = new Configuration();
+     runner.setConf(conf);
+     String[] args = { "--script", "mapred.rb", "inputs", "outputs" };
+     try {
+       runner.run(args);
+     } catch (Throwable t) { /* ignore */ }
+   }
+ }
data/test/java/org/apache/hadoop/ruby/mapred/JRubyMapperTest.java ADDED
@@ -0,0 +1,31 @@
+ package org.apache.hadoop.ruby.mapred;
+
+ import java.io.IOException;
+
+ import org.apache.hadoop.io.LongWritable;
+ import org.apache.hadoop.io.Text;
+ import org.apache.hadoop.mapred.JobConf;
+ import org.junit.Test;
+
+ public class JRubyMapperTest {
+
+   @Test
+   public void testMap() throws IOException {
+     LongWritable key = new LongWritable(0L);
+     Text value = new Text();
+     value.set("value");
+
+     JRubyMapper mapper = new JRubyMapper();
+     JobConf conf = new JobConf();
+     conf.set("mapred.ruby.script", "mapred.rb");
+     mapper.configure(conf);
+
+     try {
+       mapper.map(key, value, null, null);
+     } catch (Throwable t) {
+       // ignore
+       // TODO mock check
+     }
+   }
+
+ }
data/test/java/org/apache/hadoop/ruby/mapred/JRubyReducerTest.java ADDED
@@ -0,0 +1,36 @@
+ package org.apache.hadoop.ruby.mapred;
+
+ import java.io.IOException;
+ import java.util.ArrayList;
+ import java.util.List;
+
+ import org.apache.hadoop.io.IntWritable;
+ import org.apache.hadoop.io.Text;
+ import org.apache.hadoop.mapred.JobConf;
+ import org.junit.Test;
+
+ public class JRubyReducerTest {
+
+   @Test
+   public void testReduce() throws IOException {
+     Text key = new Text();
+     key.set("key");
+     List<IntWritable> values = new ArrayList<IntWritable>();
+     values.add(new IntWritable(1));
+     values.add(new IntWritable(2));
+     values.add(new IntWritable(3));
+
+     JRubyReducer reducer = new JRubyReducer();
+     JobConf conf = new JobConf();
+     conf.set("mapred.ruby.script", "mapred.rb");
+     reducer.configure(conf);
+
+     try {
+       reducer.reduce(key, values.iterator(), null, null);
+     } catch (Throwable t) {
+       // ignore
+       // TODO mock check
+     }
+   }
+
+ }
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: jruby-on-hadoop
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.0.3
 platform: ruby
 authors:
 - Koichi Fujikawa
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2009-12-28 00:00:00 +09:00
+date: 2010-01-03 00:00:00 +09:00
 default_executable: joh
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -29,17 +29,24 @@ executables:
 extensions: []
 
 extra_rdoc_files:
-- README
+- README.rdoc
 files:
-- README
+- README.rdoc
 - Rakefile
 - VERSION
+- bin/joh
 - build.xml
+- examples/wordcount.rb
 - jruby-on-hadoop.gemspec
 - lib/hadoop-ruby.jar
 - lib/jruby-on-hadoop.rb
 - lib/jruby-on-hadoop/client.rb
+- lib/ruby_wrapper.rb
 - spec/jruby-on-hadoop_spec.rb
+- spec/ruby_wrapper_spec.rb
+- test/java/org/apache/hadoop/ruby/JRubyJobRunnerTest.java
+- test/java/org/apache/hadoop/ruby/mapred/JRubyMapperTest.java
+- test/java/org/apache/hadoop/ruby/mapred/JRubyReducerTest.java
 has_rdoc: true
 homepage: http://github.com/fujibee/jruby-on-hadoop
 licenses: []
@@ -70,3 +77,5 @@ specification_version: 3
 summary: JRuby on Hadoop
 test_files:
 - spec/jruby-on-hadoop_spec.rb
+- spec/ruby_wrapper_spec.rb
+- examples/wordcount.rb
data/README DELETED
@@ -1,30 +0,0 @@
1
- = hadoop-ruby
2
-
3
- == Description
4
- HadoopのMapper/ReducerをRubyで記述することができます。
5
- hadoop-rubydslのためのJRubyラッパーです。
6
-
7
- 例)
8
- init.rb
9
-
10
- def map(script, key, value, output, reporter)
11
- # map処理
12
- end
13
-
14
- def reduce(script, key, values, output, reporter)
15
- # reduce処理
16
- end
17
-
18
- == Build
19
-
20
- ant
21
-
22
- を実行します。
23
- 環境変数HADOOP_HOMEを設定する必要があります。
24
- 想定しているHadoopのバージョンは0.19.2です。
25
-
26
- == Author
27
- Koichi Fujikawa <fujibee@gmail.com>
28
-
29
- == Copyright
30
- License: Apache License