hadoop-jruby-connector 0.0.1.2010122601

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README ADDED
@@ -0,0 +1 @@
+ Now developing. Please stay tuned.
data/Rakefile ADDED
@@ -0,0 +1,26 @@
+ require 'rspec/core/rake_task'
+
+ # rspec
+ RSpec::Core::RakeTask.new(:spec) do |t|
+   t.rspec_opts = ['-c', '-fs']
+ end
+
+ # jeweler
+ begin
+   require 'jeweler'
+   Jeweler::Tasks.new do |gemspec|
+     gemspec.name = "hadoop-jruby-connector"
+     gemspec.summary = "Hadoop connector by JRuby"
+     gemspec.description = "Hadoop connector by JRuby"
+     gemspec.email = "fujibee@hapyrus.com"
+     gemspec.homepage = "http://github.com/hapyrus/hadoop-jruby-connector"
+     gemspec.authors = ["Koichi Fujikawa"]
+
+     gemspec.files.exclude "spec/**/*"
+   end
+   Jeweler::GemcutterTasks.new
+ rescue LoadError
+   puts "Jeweler not available. Install it with: gem install jeweler"
+ end
+
+ task :default => :spec
data/VERSION ADDED
@@ -0,0 +1 @@
+ 0.0.1.2010122601
data/lib/hjc.rb ADDED
@@ -0,0 +1,15 @@
+ require 'java'
+
+ require 'hjc/util'
+ require 'hjc/hadoop_streaming'
+ require 'hjc/fs_shell'
+
+ home = ENV['HADOOP_HOME']
+ ['', 'lib', 'contrib/streaming'].each do |path|
+   Dir.glob(File.join(home, path, "*.jar")) {|jar| require jar }
+ end
+ $CLASSPATH << "#{home}/conf"
+
+ java_import org.apache.hadoop.streaming.StreamJob
+ java_import org.apache.hadoop.fs.FsShell
+ java_import org.apache.hadoop.conf.Configuration
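For context, requiring the library under JRuby is all the setup the loader above needs: it pulls every jar under HADOOP_HOME (plus lib/ and contrib/streaming/) onto the classpath and imports the Java classes used below. A minimal sketch, assuming JRuby and a local Hadoop installation; the install path is hypothetical:

    # Setup sketch (assumptions: running under JRuby; the path below is illustrative)
    ENV['HADOOP_HOME'] ||= '/usr/lib/hadoop'  # hypothetical install location
    require 'hjc'  # loads the Hadoop jars and imports StreamJob, FsShell, Configuration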
data/lib/hjc/fs_shell.rb ADDED
@@ -0,0 +1,36 @@
+ module Hjc
+   class FsShell
+     def initialize
+       @params = []
+     end
+
+     def put(local, remote)
+       @cmd = :put
+       @params = [local, remote]
+       run
+     end
+
+     def get(remote, local)
+       @cmd = :get
+       @params = [remote, local]
+       run
+     end
+
+     def rm(remote)
+       @cmd = :rm
+       @params = [remote]
+       run
+     end
+
+     def run
+       java_shell = ::FsShell.new(Configuration.new)
+       java_shell.run(build_args)
+     end
+
+     def build_args
+       concated_args = ["-#{@cmd}"]
+       concated_args.concat @params
+       concated_args
+     end
+   end
+ end
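For orientation, a minimal usage sketch of the FsShell wrapper above. It assumes JRuby, HADOOP_HOME pointing at a working installation, and a reachable HDFS; all file names are illustrative, not part of the gem:

    require 'hjc'

    shell = Hjc::FsShell.new
    shell.put('/tmp/words.txt', 'words.txt')       # copy a local file into HDFS
    shell.get('words.txt', '/tmp/words_copy.txt')  # copy it back to the local filesystem
    shell.rm('words.txt')                          # remove the HDFS copy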
data/lib/hjc/hadoop_streaming.rb ADDED
@@ -0,0 +1,63 @@
+ module Hjc
+   class HadoopStreaming
+     attr_accessor :input_path, :output_path, :mapper_path, :reducer_path
+     attr_accessor :local
+     attr_reader :options
+
+     def initialize
+       @files = {}
+       @options = {}
+       @local = false
+     end
+
+     def run
+       java_job = StreamJob.new
+       @ret = java_job.run(args)
+     end
+
+     def success?
+       @ret == 0 # success if job returned 0
+     end
+
+     def input=(input)
+       # the -input param seems to expect an exact path on Hadoop Streaming
+       file = Util.to_temp_file('input', input)
+       @input_path = file.path
+       unless @local # path seems to be on HDFS
+         sh = FsShell.new
+         sh.put(file.path, file.path)
+       end
+     end
+
+     def mapper=(mapper)
+       @files['mapper'] = file = Util.to_temp_file('mapper', mapper, :mod => 0700)
+       @mapper_path = File.basename(file.path)
+     end
+
+     def reducer=(reducer)
+       @files['reducer'] = file = Util.to_temp_file('reducer', reducer, :mod => 0700)
+       @reducer_path = File.basename(file.path)
+     end
+
+     def args
+       concated_args = []
+       concated_args.concat ['-input', @input_path] if @input_path
+       concated_args.concat ['-output', @output_path] if @output_path
+       concated_args.concat ['-mapper', @mapper_path] if @mapper_path
+       concated_args.concat ['-reducer', @reducer_path] if @reducer_path
+       concated_args.concat ['-dfs', 'file:///'] if @local
+       concated_args.concat ['-jt', 'local'] if @local # no use?
+
+       @options.each do |k, v|
+         concated_args.concat ["-#{k}", v]
+       end
+
+       @files.each do |k, v|
+         concated_args.concat ["-file", v.path]
+       end
+
+       puts "args: #{concated_args.join(' ')}"
+       concated_args
+     end
+   end
+ end
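A short usage sketch of the HadoopStreaming wrapper above, following the same pattern as the specs further down. It assumes JRuby, HADOOP_HOME set, and a tmp/ directory in the working directory (Util::TMP_DIR); map.rb and reduce.rb are hypothetical local script files, not part of the gem. Note that local mode is set before input= so the input temp file is not pushed to HDFS:

    require 'hjc'

    job = Hjc::HadoopStreaming.new
    job.local = true                        # run against the local filesystem (-dfs file:///)
    job.input = "hoge fuga foo hoge foo"    # written to a temp file and passed as -input
    job.mapper  = File.read('map.rb')       # mapper script body (map.rb is hypothetical)
    job.reducer = File.read('reduce.rb')    # reducer script body (reduce.rb is hypothetical)
    job.output_path = 'tmp/out'

    job.run
    puts job.success?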
data/lib/hjc/util.rb ADDED
@@ -0,0 +1,21 @@
+ require 'tempfile'
+
+ module Hjc
+   class Util
+     TMP_DIR = 'tmp'
+
+     def self.to_temp_file(filename, body, options={})
+       file = Tempfile.new(filename, TMP_DIR)
+       file.print body
+       file.close
+
+       unless options.empty?
+         if options.keys.include? :mod
+           File.chmod(options[:mod], file.path)
+         end
+       end
+
+       file
+     end
+   end
+ end
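A brief sketch of Util.to_temp_file in isolation: it writes the string into a Tempfile under the relative tmp/ directory (which must already exist) and optionally chmods it. The script body below is illustrative:

    script = Hjc::Util.to_temp_file('map.rb', "#!/usr/bin/env ruby\nputs ARGF.read", :mod => 0700)
    puts script.path                    # e.g. tmp/map.rb20101226-12345-abcdef
    puts File.executable?(script.path)  # => true, because of :mod => 0700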
data/spec/hjc/fs_shell_spec.rb ADDED
@@ -0,0 +1,19 @@
+ require 'hjc'
+
+ module Hjc
+   describe FsShell do
+     it 'put and get file to hdfs, and delete' do
+       localfile = Util.to_temp_file('localfile', 'fs shell test')
+
+       shell = FsShell.new
+
+       shell.put(localfile.path, 'remotefile')
+       shell.get('remotefile', 'tmp/returnedfile')
+
+       File.open('tmp/returnedfile').read.should == 'fs shell test'
+
+       shell.rm('remotefile')
+       File.delete('tmp/returnedfile')
+     end
+   end
+ end
data/spec/hjc/hadoop_streaming_spec.rb ADDED
@@ -0,0 +1,119 @@
+ require 'hjc'
+ require 'fileutils'
+
+ module Hjc
+   describe HadoopStreaming do
+     TMP_DIR = 'tmp'
+
+     before :all do
+       @map_script = Util.to_temp_file('map.rb', MAPPER, :mod => 0700)
+       @reduce_script = Util.to_temp_file('reduce.rb', REDUCER, :mod => 0700)
+       @data_file = Util.to_temp_file('testdata', TEST_DATA)
+       @output_dir = TMP_DIR + '/out' + Time.new.to_i.to_s
+     end
+
+     it 'create args for hadoop streaming' do
+       job = HadoopStreaming.new
+       job.input_path = "input"
+       job.output_path = "outdir"
+       job.mapper_path = "mapper"
+       job.reducer_path = "reducer"
+
+       job.args.join(" ").should == "-input input -output outdir -mapper mapper -reducer reducer"
+     end
+
+     it 'create args for hadoop streaming with options' do
+       job = HadoopStreaming.new
+       job.options["dfs"] = "local"
+       job.options["jobconf"] = "mapred.map.tasks=1"
+
+       job.args.join(" ").should == %!-dfs local -jobconf mapred.map.tasks=1!
+     end
+
+     it 'create args for hadoop streaming with files' do
+       job = HadoopStreaming.new
+       job.input = 'input'
+       job.mapper = 'mapper'
+       job.reducer = 'reducer'
+
+       job.args.should include('-file')
+       job.args.join(" ").should match(/input/)
+       job.args.join(" ").should match(/mapper/)
+       job.args.join(" ").should match(/reducer/)
+     end
+
+     it 'can run Hadoop streaming job with path' do
+       pending 'path does not work'
+       job = HadoopStreaming.new
+       job.input_path = "file://" + File.expand_path(@data_file.path)
+       job.output_path = @output_dir
+       job.mapper_path = @map_script.path
+       job.reducer_path = @reduce_script.path
+       job.local = true
+
+       job.run
+
+       assert_result
+
+       clean_output
+     end
+
+     it 'can run Hadoop streaming job with string' do
+       job = HadoopStreaming.new
+       job.input = TEST_DATA
+       job.output_path = @output_dir
+       job.mapper = MAPPER
+       job.reducer = REDUCER
+       job.local = true
+
+       job.run
+
+       assert_result
+
+       clean_output
+     end
+
+     def assert_result
+       File.open(File.join(@output_dir, 'part-00000')) do |f|
+         h = {}
+         f.readlines.each do |line|
+           a = line.split(/\t/)
+           h[a[0]] = a[1].chomp
+         end
+         h.should == {'bar' => '1', 'foo' => '3', 'fuga' => '2', 'hoge' => '4'}
+       end
+     end
+
+     def clean_output
+       FileUtils.rm_r([@output_dir])
+     end
+
+     MAPPER = <<-'EOF'
+ #!/usr/bin/env ruby
+ ARGF.each do |line|
+   line.chomp!
+   line.split.each do |word|
+     puts "#{word}\t1"
+   end
+ end
+     EOF
+
+     REDUCER = <<-'EOF'
+ #!/usr/bin/env ruby
+ count = Hash.new {|h,k| h[k] = 0}
+ ARGF.each do |line|
+   line.chomp!
+   key, value = line.split(/\t/)
+   count[key] += 1
+ end
+ count.each do |k,v|
+   puts "#{k}\t#{v}"
+ end
+     EOF
+
+     TEST_DATA = <<-'EOF'
+ hoge fuga foo hoge foo
+ foo bar hoge hoge fuga
+     EOF
+   end
+ end
data/spec/hjc/util_spec.rb ADDED
@@ -0,0 +1,18 @@
+ require 'hjc'
+
+ module Hjc
+   describe 'Hjc::Util' do
+     it 'can convert string to file' do
+       f = Util.to_temp_file('map.rb', 'mapscript')
+
+       FileTest.exist?(f.path).should be_true
+       File.open(f.path).read.should == 'mapscript'
+     end
+
+     it 'can convert string to file with exec flag' do
+       f = Util.to_temp_file('map.rb', 'mapscript', :mod => 0700)
+
+       FileTest.executable?(f.path).should be_true
+     end
+   end
+ end
metadata ADDED
@@ -0,0 +1,71 @@
+ --- !ruby/object:Gem::Specification
+ name: hadoop-jruby-connector
+ version: !ruby/object:Gem::Version
+   prerelease: false
+   segments:
+   - 0
+   - 0
+   - 1
+   - 2010122601
+   version: 0.0.1.2010122601
+ platform: ruby
+ authors:
+ - Koichi Fujikawa
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2010-12-26 00:00:00 +09:00
+ default_executable:
+ dependencies: []
+
+ description: Hadoop connector by JRuby
+ email: fujibee@hapyrus.com
+ executables: []
+
+ extensions: []
+
+ extra_rdoc_files:
+ - README
+ files:
+ - README
+ - Rakefile
+ - VERSION
+ - lib/hjc.rb
+ - lib/hjc/fs_shell.rb
+ - lib/hjc/hadoop_streaming.rb
+ - lib/hjc/util.rb
+ has_rdoc: true
+ homepage: http://github.com/hapyrus/hadoop-jruby-connector
+ licenses: []
+
+ post_install_message:
+ rdoc_options: []
+
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       segments:
+       - 0
+       version: "0"
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       segments:
+       - 0
+       version: "0"
+ requirements: []
+
+ rubyforge_project:
+ rubygems_version: 1.3.6
+ signing_key:
+ specification_version: 3
+ summary: Hadoop connector by JRuby
+ test_files:
+ - spec/hjc/fs_shell_spec.rb
+ - spec/hjc/hadoop_streaming_spec.rb
+ - spec/hjc/util_spec.rb