hadoop-rubydsl 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc ADDED
@@ -0,0 +1,58 @@
1
+ = hadoop-rubydsl
2
+
3
+ Enables you to run Ruby DSL scripts on your Hadoop cluster.
4
+
5
+ == Description
6
+
7
+ You can write a DSL script in Ruby that runs on Hadoop as a Mapper / Reducer.
8
+ This gem depends on 'jruby-on-hadoop' project.
9
+
10
+ == Install
11
+
12
+ Required gems are all on GemCutter.
13
+
14
+ 1. Upgrade your RubyGems to 1.3.5
15
+ 2. Install gems
16
+ $ gem install hadoop-rubydsl
17
+
18
+ == Usage
19
+
20
+ 1. Run Hadoop cluster on your machines and set HADOOP_HOME env variable.
21
+ 2. Put files into your HDFS. ex) wc/inputs/file1
22
+ 3. Now you can run 'hrd' like below:
23
+ $ hrd examples/word_count_test.rb
24
+ You can get Hadoop job results in your HDFS at wc/outputs/part-*
25
+
26
+ == Examples
27
+
28
+ Word Count DSL script
29
+ use 'WordCount'
30
+
31
+ from 'wc/inputs'
32
+ to 'wc/outputs'
33
+
34
+ count_uniq
35
+ total :bytes, :words, :lines
36
+
37
+ Log Analysis DSL script
38
+ use 'LogAnalysis'
39
+
40
+ data 'apache log on test2' do
41
+ from 'apachelog/inputs'
42
+ to 'apachelog/outputs'
43
+
44
+ each_line do
45
+ pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*) (.*) "(.*)"/
46
+ column_name 'remote_host', 'pass', 'user', 'access_date', 'request', 'status', 'bytes', 'pass', 'ua'
47
+
48
+ topic 'ua counts', :label => 'ua' do
49
+ count_uniq column[:ua]
50
+ end
51
+ end
52
+ end
53
+
54
+ == Author
55
+ Koichi Fujikawa <fujibee@gmail.com>
56
+
57
+ == Copyright
58
+ License: Apache License
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.2
1
+ 0.0.3
data/bin/hrd ADDED
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'hadoop-dsl'
4
+
5
+ HadoopDsl::Client.new(ARGV).run
@@ -5,26 +5,27 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{hadoop-rubydsl}
8
- s.version = "0.0.2"
8
+ s.version = "0.0.3"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Koichi Fujikawa"]
12
- s.date = %q{2009-12-28}
12
+ s.date = %q{2010-01-04}
13
13
  s.description = %q{Hadoop Ruby DSL}
14
14
  s.email = %q{fujibee@gmail.com}
15
- s.executables = ["hadoop-hudson.sh", "hadoop-ruby.sh"]
15
+ s.executables = ["hrd", "hadoop-hudson.sh", "hadoop-ruby.sh"]
16
16
  s.extra_rdoc_files = [
17
- "README",
17
+ "README.rdoc",
18
18
  "TODO"
19
19
  ]
20
20
  s.files = [
21
21
  ".gitignore",
22
- "README",
22
+ "README.rdoc",
23
23
  "Rakefile",
24
24
  "TODO",
25
25
  "VERSION",
26
26
  "bin/hadoop-hudson.sh",
27
27
  "bin/hadoop-ruby.sh",
28
+ "bin/hrd",
28
29
  "conf/hadoop-site.xml",
29
30
  "examples/apachelog-v2-2.rb",
30
31
  "examples/apachelog-v2.rb",
@@ -32,11 +33,11 @@ Gem::Specification.new do |s|
32
33
  "examples/hive_like_test.rb",
33
34
  "examples/word_count_test.rb",
34
35
  "hadoop-rubydsl.gemspec",
36
+ "lib/client.rb",
35
37
  "lib/core.rb",
38
+ "lib/dsl_init.rb",
39
+ "lib/hadoop-dsl.rb",
36
40
  "lib/hive_like.rb",
37
- "lib/init.rb",
38
- "lib/java/.gitignore",
39
- "lib/java/hadoop-ruby.jar",
40
41
  "lib/log_analysis.rb",
41
42
  "lib/mapred_factory.rb",
42
43
  "lib/util.rb",
@@ -49,14 +50,15 @@ Gem::Specification.new do |s|
49
50
  s.summary = %q{Hadoop Ruby DSL}
50
51
  s.test_files = [
51
52
  "spec/spec_helper.rb",
53
+ "spec/dsl_init_spec.rb",
52
54
  "spec/core_spec.rb",
55
+ "spec/client_spec.rb",
53
56
  "spec/util_spec.rb",
54
57
  "spec/mapred_factory_spec.rb",
55
58
  "spec/word_count_spec.rb",
56
59
  "spec/hive_like_spec.rb",
57
60
  "spec/log_analysis_spec.rb",
58
61
  "spec/example_spec.rb",
59
- "spec/init_spec.rb",
60
62
  "examples/apachelog-v2.rb",
61
63
  "examples/hive_like_test.rb",
62
64
  "examples/word_count_test.rb",
data/lib/client.rb ADDED
@@ -0,0 +1,27 @@
1
+ module HadoopDsl
2
+ class Client < JRubyOnHadoop::Client
3
+ def parse_args
4
+ super
5
+ @script_path = HadoopDsl.dsl_init_script
6
+ @script = File.basename(@script_path)
7
+ @dsl_file_path = @args[0]
8
+ @dsl_file = File.basename(@dsl_file_path)
9
+ @files << @script_path << @dsl_file_path
10
+
11
+ # TODO move properly, with jruby-on-hadoop
12
+ add_dsl_lib_files
13
+ ENV['RUBYLIB'] = File.dirname(@dsl_file_path)
14
+ end
15
+
16
+ def mapred_args
17
+ args = super
18
+ args += " --dslfile #{@dsl_file}"
19
+ args
20
+ end
21
+
22
+ def add_dsl_lib_files
23
+ lib_path = HadoopDsl.lib_path
24
+ @files += Dir.glob(File.join(lib_path, "*.rb"))
25
+ end
26
+ end
27
+ end
@@ -12,15 +12,14 @@ HadoopDsl::Text = Text
12
12
  HadoopDsl::IntWritable = IntWritable
13
13
 
14
14
  def map(key, value, output, reporter, script)
15
- mapper = MapperFactory.create(script, key.to_string, value.to_string)
15
+ mapper = MapperFactory.create(script, key, value)
16
16
  mapper.run
17
17
 
18
18
  write(output, mapper)
19
19
  end
20
20
 
21
21
  def reduce(key, values, output, reporter, script)
22
- ruby_values = values.map {|v| to_ruby(v)}
23
- reducer = ReducerFactory.create(script, key.to_string, ruby_values)
22
+ reducer = ReducerFactory.create(script, key, values)
24
23
  reducer.run
25
24
 
26
25
  write(output, reducer)
@@ -29,8 +28,7 @@ end
29
28
  def setup(conf, script)
30
29
  setup = SetupFactory.create(script, conf)
31
30
  setup.run
32
-
33
- setup.paths.to_java
31
+ setup.paths
34
32
  end
35
33
 
36
34
  private
@@ -38,23 +36,7 @@ private
38
36
  def write(output, controller)
39
37
  controller.emitted.each do |e|
40
38
  e.each do |k, v|
41
- output.collect(to_hadoop(k), to_hadoop(v))
39
+ output.collect(k, v)
42
40
  end
43
41
  end
44
42
  end
45
-
46
- def to_ruby(value)
47
- case value
48
- when IntWritable then value.get
49
- when Text then value.to_string
50
- else raise "no match class: #{value.class}"
51
- end
52
- end
53
-
54
- def to_hadoop(value)
55
- case value
56
- when Integer then IntWritable.new(value)
57
- when String then t = Text.new; t.set(value); t
58
- else raise "no match class: #{value.class}"
59
- end
60
- end
data/lib/hadoop-dsl.rb ADDED
@@ -0,0 +1,12 @@
1
+ require 'jruby-on-hadoop'
2
+ require 'client'
3
+
4
+ module HadoopDsl
5
+ def self.lib_path
6
+ File.expand_path(File.dirname(__FILE__))
7
+ end
8
+
9
+ def self.dsl_init_script
10
+ File.join(lib_path, "dsl_init.rb")
11
+ end
12
+ end
data/lib/util.rb CHANGED
@@ -6,6 +6,17 @@ module HadoopDsl
6
6
  end
7
7
 
8
8
  def read_file(file_name)
9
- File.open(file_name).read
9
+ # read as usual
10
+ body = File.open(file_name).read rescue nil
11
+ return body if body
12
+
13
+ # read from loadpath
14
+ $:.each do |path|
15
+ p path
16
+ body = File.open(File.join(path, file_name)).read rescue next
17
+ return body if body
18
+ end
19
+
20
+ raise "cannot find file - #{file_name}"
10
21
  end
11
22
  end
@@ -0,0 +1,26 @@
1
+ require 'hadoop-dsl'
2
+
3
+ describe HadoopDsl::Client do
4
+ before do
5
+ @client = HadoopDsl::Client.new(["examples/wordcount.rb", "in", "out"])
6
+ end
7
+
8
+ it 'can parse args' do
9
+ @client.files.join.should match /ruby_wrapper\.rb/
10
+ @client.files.join.should match /dsl_init\.rb/
11
+ @client.files.should include 'examples/wordcount.rb'
12
+ @client.inputs.should == 'in'
13
+ @client.outputs.should == 'out'
14
+ end
15
+
16
+ it 'can add dsl file into mapred args' do
17
+ @client.mapred_args.should ==
18
+ "--script dsl_init.rb in out --dslfile wordcount.rb"
19
+ end
20
+
21
+ it 'can add dsl lib files' do
22
+ lib_path = HadoopDsl.lib_path
23
+ @client.files.should include File.join(lib_path, 'core.rb')
24
+ @client.files.should include File.join(lib_path, 'log_analysis.rb')
25
+ end
26
+ end
data/spec/core_spec.rb CHANGED
@@ -1,4 +1,4 @@
1
- require 'init'
1
+ require 'dsl_init'
2
2
  require 'core'
3
3
 
4
4
  include HadoopDsl
@@ -1,9 +1,4 @@
1
- require 'java'
2
- require 'init'
3
-
4
- import 'org.apache.hadoop.io.IntWritable'
5
- import 'org.apache.hadoop.io.Text'
6
- import 'org.apache.hadoop.mapred.JobConf'
1
+ require 'dsl_init'
7
2
 
8
3
  describe 'mapreduce init' do
9
4
 
@@ -24,22 +19,20 @@ end
24
19
  end
25
20
 
26
21
  before do
27
- @one = IntWritable.new(1)
22
+ @one = 1
28
23
  @output = mock('output')
29
24
  end
30
25
 
31
26
  it 'can map sucessfully' do
32
- key, value = Text.new, Text.new
33
- key.set("key")
34
- value.set('it should be fine')
27
+ key = 'key'
28
+ value = 'it should be fine'
35
29
  @output.should_receive(:collect).once #.with(@text, @one)
36
30
 
37
31
  map(key, value, @output, nil, @script)
38
32
  end
39
33
 
40
34
  it 'can reduce sucessfully' do
41
- key, value = Text.new, Text.new
42
- key.set("t1\tkey")
35
+ key = "t1\tkey"
43
36
  values = [@one, @one, @one]
44
37
  @output.should_receive(:collect).once #.with(@text, @one)
45
38
 
@@ -47,7 +40,7 @@ end
47
40
  end
48
41
 
49
42
  it 'can set job conf' do
50
- conf = JobConf.new
43
+ conf = mock('jobconf')
51
44
  paths = setup(conf, @script)
52
45
 
53
46
  paths[0].should == 'test/inputs'
@@ -1,4 +1,3 @@
1
- require 'init'
2
1
  require 'core'
3
2
  require 'hive_like'
4
3
 
@@ -1,4 +1,3 @@
1
- require 'init'
2
1
  require 'core'
3
2
  require 'log_analysis'
4
3
 
data/spec/util_spec.rb CHANGED
@@ -12,4 +12,8 @@ describe 'utilities' do
12
12
  @script = create_tmp_script(script_body)
13
13
  read_file(@script).should == script_body
14
14
  end
15
+
16
+ it 'raise error if no file in loadpath' do
17
+ lambda { read_file('not_exists_on_loadpath') }.should raise_error
18
+ end
15
19
  end
@@ -1,4 +1,3 @@
1
- require 'init'
2
1
  require 'core'
3
2
  require 'word_count'
4
3
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hadoop-rubydsl
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Koichi Fujikawa
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-12-28 00:00:00 +09:00
12
+ date: 2010-01-04 00:00:00 +09:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -25,21 +25,23 @@ dependencies:
25
25
  description: Hadoop Ruby DSL
26
26
  email: fujibee@gmail.com
27
27
  executables:
28
+ - hrd
28
29
  - hadoop-hudson.sh
29
30
  - hadoop-ruby.sh
30
31
  extensions: []
31
32
 
32
33
  extra_rdoc_files:
33
- - README
34
+ - README.rdoc
34
35
  - TODO
35
36
  files:
36
37
  - .gitignore
37
- - README
38
+ - README.rdoc
38
39
  - Rakefile
39
40
  - TODO
40
41
  - VERSION
41
42
  - bin/hadoop-hudson.sh
42
43
  - bin/hadoop-ruby.sh
44
+ - bin/hrd
43
45
  - conf/hadoop-site.xml
44
46
  - examples/apachelog-v2-2.rb
45
47
  - examples/apachelog-v2.rb
@@ -47,11 +49,11 @@ files:
47
49
  - examples/hive_like_test.rb
48
50
  - examples/word_count_test.rb
49
51
  - hadoop-rubydsl.gemspec
52
+ - lib/client.rb
50
53
  - lib/core.rb
54
+ - lib/dsl_init.rb
55
+ - lib/hadoop-dsl.rb
51
56
  - lib/hive_like.rb
52
- - lib/init.rb
53
- - lib/java/.gitignore
54
- - lib/java/hadoop-ruby.jar
55
57
  - lib/log_analysis.rb
56
58
  - lib/mapred_factory.rb
57
59
  - lib/util.rb
@@ -86,14 +88,15 @@ specification_version: 3
86
88
  summary: Hadoop Ruby DSL
87
89
  test_files:
88
90
  - spec/spec_helper.rb
91
+ - spec/dsl_init_spec.rb
89
92
  - spec/core_spec.rb
93
+ - spec/client_spec.rb
90
94
  - spec/util_spec.rb
91
95
  - spec/mapred_factory_spec.rb
92
96
  - spec/word_count_spec.rb
93
97
  - spec/hive_like_spec.rb
94
98
  - spec/log_analysis_spec.rb
95
99
  - spec/example_spec.rb
96
- - spec/init_spec.rb
97
100
  - examples/apachelog-v2.rb
98
101
  - examples/hive_like_test.rb
99
102
  - examples/word_count_test.rb
data/README DELETED
@@ -1,53 +0,0 @@
1
- = hadoop-rubydsl
2
-
3
- == Description
4
- HadoopのMapper/ReducerをRubyによるDSLで記述することができます。
5
- hadoop-ruby.jarを利用します。
6
-
7
- 例)
8
- apachelog.rb
9
-
10
- # log:
11
- # 127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326
12
- # 127.0.0.1 - frank2 [10/Oct/2000:13:55:36 -0700] "GET /apache_pb2.gif HTTP/1.0" 200 2326
13
- # 127.0.0.1 - frank2 [10/Oct/2000:13:55:36 -0700] "GET /apache_pb3.gif HTTP/1.0" 404 2326
14
-
15
- use 'LogAnalysis'
16
- data.pattern /(.*) (.*) (.*) (\[.*\]) (".*") (\d*) (\d*)/
17
- column[2].count_uniq
18
- column[3].count_uniq
19
- column[4].count_uniq
20
- column[5].count_uniq
21
- column[6].sum
22
-
23
- =>
24
- col2 frank 1
25
- col2 frank2 2
26
- col3 [10/Oct/2000:13:55:36 -0700] 3
27
- col4 "GET /apache_pb.gif HTTP/1.0" 1
28
- col4 "GET /apache_pb2.gif HTTP/1.0" 1
29
- col4 "GET /apache_pb3.gif HTTP/1.0" 1
30
- col5 200 2
31
- col5 404 1
32
- col6 6978
33
-
34
- == Usage
35
- 0. HADOOP_HOMEを正しく設定し、Hadoopを一式立ち上げておく。
36
-
37
- 1. jruby-complete-*.jar を lib/java 以下にコピー
38
- ex)
39
- $ wget http://jruby.kenai.com/downloads/1.4.0RC2/jruby-complete-1.4.0RC2.jar
40
- $ cp jruby-complete-*.jar lib/java/
41
-
42
- 2. データを HDFS にアップロード
43
- ex)
44
- $ hadoop dfs -copyFromLocal apachelog inputs/
45
-
46
- 3. MapReduce実行
47
- $ bin/hadoop-ruby.sh examples/apachelog.rb inputs outputs
48
-
49
- == Author
50
- Koichi Fujikawa <fujibee@gmail.com>
51
-
52
- == Copyright
53
- License: Apache License
data/lib/java/.gitignore DELETED
@@ -1 +0,0 @@
1
- jruby-complete-*.jar
Binary file