rubydoop 1.0.0-java

data/lib/hadoop.rb ADDED
@@ -0,0 +1,31 @@
+ # encoding: utf-8
+
+ require 'java'
+
+
+ # @private
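+ # These are thin JRuby `include_package` wrappers: the Java classes are
+ # resolved lazily, by constant lookup, once the Hadoop jars are on the
+ # classpath. A minimal usage sketch (assuming a JRuby runtime with the
+ # Hadoop distribution jars loaded):
+ #
+ #     require 'hadoop'
+ #
+ #     text = Hadoop::Io::Text.new('hello')      # org.apache.hadoop.io.Text
+ #     path = Hadoop::Fs::Path.new('/tmp/input') # org.apache.hadoop.fs.Path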
+ module Hadoop
+   module Io
+     include_package 'org.apache.hadoop.io'
+   end
+
+   module Mapreduce
+     include_package 'org.apache.hadoop.mapreduce'
+
+     module Lib
+       include_package 'org.apache.hadoop.mapreduce.lib'
+
+       module Input
+         include_package 'org.apache.hadoop.mapreduce.lib.input'
+       end
+
+       module Output
+         include_package 'org.apache.hadoop.mapreduce.lib.output'
+       end
+     end
+   end
+
+   module Fs
+     include_package 'org.apache.hadoop.fs'
+   end
+ end
data/lib/rubydoop.jar ADDED
Binary file
data/lib/rubydoop.rb ADDED
@@ -0,0 +1,58 @@
+ # encoding: utf-8
+
+ $LOAD_PATH << File.expand_path('..', __FILE__)
+
+
+ require 'hadoop'
+
+
+ # See {Rubydoop.configure} for the job configuration DSL documentation,
+ # {Package} for the packaging documentation, or the {file:README.md README}
+ # for a getting started guide.
+ module Rubydoop
+   # @private
+   def self.create_mapper(conf)
+     create_instance(conf.get(MAPPER_KEY))
+   end
+
+   # @private
+   def self.create_reducer(conf)
+     create_instance(conf.get(REDUCER_KEY))
+   end
+
+   # @private
+   def self.create_combiner(conf)
+     create_instance(conf.get(COMBINER_KEY))
+   end
+
+   # @private
+   def self.create_partitioner(conf)
+     create_instance(conf.get(PARTITIONER_KEY))
+   end
+
+   # @private
+   def self.create_grouping_comparator(conf)
+     create_instance(conf.get(GROUPING_COMPARATOR_KEY))
+   end
+
+   # @private
+   def self.create_sort_comparator(conf)
+     create_instance(conf.get(SORT_COMPARATOR_KEY))
+   end
+
+   private
+
+   MAPPER_KEY = 'rubydoop.mapper'.freeze
+   REDUCER_KEY = 'rubydoop.reducer'.freeze
+   COMBINER_KEY = 'rubydoop.combiner'.freeze
+   PARTITIONER_KEY = 'rubydoop.partitioner'.freeze
+   GROUPING_COMPARATOR_KEY = 'rubydoop.grouping_comparator'.freeze
+   SORT_COMPARATOR_KEY = 'rubydoop.sort_comparator'.freeze
+
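+   # Resolves a constant path stored in the job configuration into a Ruby
+   # class and instantiates it. A rough sketch of the lookup, assuming a job
+   # was configured with `mapper WordCount::Mapper` (the `conf` object here
+   # stands in for the Hadoop `Configuration` the Java side passes in):
+   #
+   #     conf.get('rubydoop.mapper')  # => 'WordCount::Mapper'
+   #     Rubydoop.create_mapper(conf) # => #<WordCount::Mapper>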
+   def self.create_instance(const_path)
+     cls = const_path.split('::').reduce(Object) { |host, name| host.const_get(name) }
+     cls.new
+   end
+ end
+
+ require 'rubydoop/dsl'
data/lib/rubydoop/dsl.rb ADDED
@@ -0,0 +1,357 @@
+ # encoding: utf-8
+
+ module Rubydoop
+   # Main entrypoint into the configuration DSL.
+   #
+   # @example Configuring a job
+   #
+   #     Rubydoop.configure do |*args|
+   #       job 'word_count' do
+   #         input args[0]
+   #         output args[1]
+   #
+   #         mapper WordCount::Mapper
+   #         reducer WordCount::Reducer
+   #
+   #         output_key Hadoop::Io::Text
+   #         output_value Hadoop::Io::IntWritable
+   #       end
+   #     end
+   #
+   # Within a `configure` block you can specify one or more jobs. The `job`
+   # blocks are run in the context of a {JobDefinition} instance, so look
+   # at that class for documentation about the available properties. The
+   # `configure` block itself is run within the context of a
+   # {ConfigurationDefinition} instance. The arguments to the `configure`
+   # block are the command line arguments, minus those handled by Hadoop's
+   # `ToolRunner`.
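+   #
+   # @example Configuring multiple jobs (a sketch; job bodies elided)
+   #
+   #     Rubydoop.configure do |input_path, output_path|
+   #       job 'tokenize' do
+   #         # ... same job DSL as above, writing to an intermediate path
+   #       end
+   #
+   #       job 'aggregate' do
+   #         # ... same job DSL as above, reading from the intermediate path
+   #       end
+   #     end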
+   #
+   # @yieldparam [Array<String>] *arguments The command line arguments
+   #
+   # @note The tool runner will set the global variable `$rubydoop_context`
+   #   to an object that contains references to the necessary Hadoop
+   #   configuration. Unless this global variable is set the configuration
+   #   block is not run (this is a feature: it means that the configuration
+   #   block doesn't run in mappers and reducers).
+   #
+   def self.configure(impl=ConfigurationDefinition, &block)
+     impl.new($rubydoop_context, &block) if $rubydoop_context
+   end
+
+   # Lower level API for configuring jobs.
+   #
+   # @example Configuring a job
+   #
+   #     cc = ConfigurationDefinition.new
+   #     cc.job 'word_count' do
+   #       # same DSL as shown in the documentation for Rubydoop.configure
+   #     end
+   #
+   class ConfigurationDefinition
+     def initialize(context=$rubydoop_context, &block)
+       @context = context
+       instance_exec(*arguments, &block) if @context && block_given?
+     end
+
+     def arguments
+       @context.arguments
+     end
+
+     def job(name, &block)
+       return nil unless @context
+       job = JobDefinition.new(@context, @context.create_job(name))
+       job.instance_exec(&block)
+       job
+     end
+   end
+
+   # Job configuration DSL.
+   #
+   # `Rubydoop.configure` blocks are run within the context of an instance of
+   # this class. These are the methods available in those blocks.
+   #
+   class JobDefinition
+     # @private
+     def initialize(context, job)
+       @context = context
+       @job = job
+     end
+
+     # Sets the input paths of the job.
+     #
+     # Calls `setInputFormatClass` on the Hadoop job and uses the static
+     # `setInputPaths` on the input format to set the job's input paths.
+     #
+     # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/mapreduce/Job.html#setInputFormatClass(java.lang.Class) Hadoop's Job#setInputFormatClass
+     #
+     # @param [String, Enumerable] paths The input paths, either a comma separated
+     #   string or an `Enumerable` of strings (which will be joined with a comma).
+     # @param [Hash] options
+     # @option options [JavaClass] :format The input format to use, defaults to `TextInputFormat`
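+     #
+     # @example Multiple input globs and a custom input format (paths are illustrative)
+     #
+     #     input %w[logs/2012-09-* logs/2012-10-*], :format => Hadoop::Mapreduce::Lib::Input::SequenceFileInputFormat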
+     def input(paths, options={})
+       paths = paths.join(',') if paths.is_a?(Enumerable)
+       format = options[:format] || Hadoop::Mapreduce::Lib::Input::TextInputFormat
+       format.set_input_paths(@job, paths)
+       @job.set_input_format_class(format)
+     end
+
+     # Sets the output path of the job.
+     #
+     # Calls `setOutputFormatClass` on the Hadoop job and uses the static
+     # `setOutputPath` on the output format to set the job's output path.
+     #
+     # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/mapreduce/Job.html#setOutputFormatClass(java.lang.Class) Hadoop's Job#setOutputFormatClass
+     #
+     # @param [String] dir The output path
+     # @param [Hash] options
+     # @option options [JavaClass] :format The output format to use, defaults to `TextOutputFormat`
+     def output(dir, options={})
+       format = options[:format] || Hadoop::Mapreduce::Lib::Output::TextOutputFormat
+       format.set_output_path(@job, Hadoop::Fs::Path.new(dir))
+       @job.set_output_format_class(format)
+     end
+
+     # Sets a job property.
+     #
+     # Calls `set`/`setBoolean`/`setLong`/`setFloat` on the Hadoop Job's
+     # configuration (the exact method depends on the type of the value).
+     #
+     # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/conf/Configuration.html#set(java.lang.String,%20java.lang.String) Hadoop's Configuration#set
+     # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/conf/Configuration.html#setBoolean(java.lang.String,%20boolean) Hadoop's Configuration#setBoolean
+     # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/conf/Configuration.html#setLong(java.lang.String,%20long) Hadoop's Configuration#setLong
+     # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/conf/Configuration.html#setFloat(java.lang.String,%20float) Hadoop's Configuration#setFloat
+     #
+     # @param [String] property The property name
+     # @param [String, Numeric, Boolean] value The property value
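+     #
+     # @example Dispatch is driven by the Ruby type of the value (property names are illustrative)
+     #
+     #     set 'mapred.reduce.tasks', 10            # conf.set_long
+     #     set 'my.job.sample_rate', 0.25           # conf.set_float
+     #     set 'mapred.compress.map.output', true   # conf.set_boolean
+     #     set 'my.job.label', 'nightly'            # conf.set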
+     def set(property, value)
+       case value
+       when Integer
+         @job.configuration.set_long(property, value)
+       when Float
+         @job.configuration.set_float(property, value)
+       when true, false
+         @job.configuration.set_boolean(property, value)
+       else
+         @job.configuration.set(property, value)
+       end
+     end
+
+     # Sets the mapper class.
+     #
+     # The equivalent of calling `setMapperClass` on a Hadoop job, but instead
+     # of a Java class you pass a Ruby class and Rubydoop will wrap it in a way
+     # that works with Hadoop.
+     #
+     # The class only needs to implement the method `map`, which will be called
+     # exactly like a Java mapper class' `map` method would be called.
+     #
+     # You can optionally implement `setup` and `cleanup`, which mirror the
+     # methods of the same name in Java mappers.
+     #
+     # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/mapreduce/Mapper.html Hadoop's Mapper
+     # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/mapreduce/Job.html#setMapperClass(java.lang.Class) Hadoop's Job#setMapperClass
+     #
+     # @param [Class] cls The (Ruby) mapper class.
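+     #
+     # @example A minimal word-count mapper (a sketch; class names are illustrative)
+     #
+     #     class WordCount
+     #       class Mapper
+     #         def map(key, value, context)
+     #           value.to_s.split.each do |word|
+     #             context.write(Hadoop::Io::Text.new(word), Hadoop::Io::IntWritable.new(1))
+     #           end
+     #         end
+     #       end
+     #     end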
+     def mapper(cls=nil)
+       if cls
+         @job.configuration.set(MAPPER_KEY, cls.name)
+         @job.set_mapper_class(@context.proxy_class(:mapper))
+         @mapper = cls
+       end
+       @mapper
+     end
+     alias_method :mapper=, :mapper
+
+     # Sets the reducer class.
+     #
+     # The equivalent of calling `setReducerClass` on a Hadoop job, but instead
+     # of a Java class you pass a Ruby class and Rubydoop will wrap it in a way
+     # that works with Hadoop.
+     #
+     # The class only needs to implement the method `reduce`, which will be called
+     # exactly like a Java reducer class' `reduce` method would be called.
+     #
+     # You can optionally implement `setup` and `cleanup`, which mirror the
+     # methods of the same name in Java reducers.
+     #
+     # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/mapreduce/Reducer.html Hadoop's Reducer
+     # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/mapreduce/Job.html#setReducerClass(java.lang.Class) Hadoop's Job#setReducerClass
+     #
+     # @param [Class] cls The (Ruby) reducer class.
+     def reducer(cls=nil)
+       if cls
+         @job.configuration.set(REDUCER_KEY, cls.name)
+         @job.set_reducer_class(@context.proxy_class(:reducer))
+         @reducer = cls
+       end
+       @reducer
+     end
+     alias_method :reducer=, :reducer
+
+     # Sets the combiner class.
+     #
+     # The equivalent of calling `setCombinerClass` on a Hadoop job, but instead
+     # of a Java class you pass a Ruby class and Rubydoop will wrap it in a way
+     # that works with Hadoop.
+     #
+     # A combiner should implement `reduce`, just like reducers.
+     #
+     # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/mapreduce/Job.html#setCombinerClass(java.lang.Class) Hadoop's Job#setCombinerClass
+     #
+     # @param [Class] cls The (Ruby) combiner class.
+     def combiner(cls=nil)
+       if cls
+         @job.configuration.set(COMBINER_KEY, cls.name)
+         @job.set_combiner_class(@context.proxy_class(:combiner))
+         @combiner = cls
+       end
+       @combiner
+     end
+     alias_method :combiner=, :combiner
+
+     # Sets a custom partitioner.
+     #
+     # The equivalent of calling `setPartitionerClass` on a Hadoop job, but instead
+     # of a Java class you pass a Ruby class and Rubydoop will wrap it in a way
+     # that works with Hadoop.
+     #
+     # The class must implement `partition`, which will be called exactly like
+     # a Java partitioner's `partition` method would be.
+     #
+     # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/mapreduce/Job.html#setPartitionerClass(java.lang.Class) Hadoop's Job#setPartitionerClass
+     #
+     # @param [Class] cls The (Ruby) partitioner class.
+     def partitioner(cls=nil)
+       if cls
+         @job.configuration.set(PARTITIONER_KEY, cls.name)
+         @job.set_partitioner_class(@context.proxy_class(:partitioner))
+         @partitioner = cls
+       end
+       @partitioner
+     end
+     alias_method :partitioner=, :partitioner
+
+     # Sets a custom grouping comparator.
+     #
+     # The equivalent of calling `setGroupingComparatorClass` on a Hadoop job,
+     # but instead of a Java class you pass a Ruby class and Rubydoop will wrap
+     # it in a way that works with Hadoop.
+     #
+     # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/mapreduce/Job.html#setGroupingComparatorClass(java.lang.Class) Hadoop's Job#setGroupingComparatorClass
+     #
+     # @param [Class] cls The (Ruby) comparator class.
+     def grouping_comparator(cls=nil)
+       if cls
+         @job.configuration.set(GROUPING_COMPARATOR_KEY, cls.name)
+         @job.set_grouping_comparator_class(@context.proxy_class(:grouping_comparator))
+         @grouping_comparator = cls
+       end
+       @grouping_comparator
+     end
+     alias_method :grouping_comparator=, :grouping_comparator
+
+     # Sets a custom sort comparator.
+     #
+     # The equivalent of calling `setSortComparatorClass` on a Hadoop job,
+     # but instead of a Java class you pass a Ruby class and Rubydoop will wrap
+     # it in a way that works with Hadoop.
+     #
+     # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/mapreduce/Job.html#setSortComparatorClass(java.lang.Class) Hadoop's Job#setSortComparatorClass
+     #
+     # @param [Class] cls The (Ruby) comparator class.
+     def sort_comparator(cls=nil)
+       if cls
+         @job.configuration.set(SORT_COMPARATOR_KEY, cls.name)
+         @job.set_sort_comparator_class(@context.proxy_class(:sort_comparator))
+         @sort_comparator = cls
+       end
+       @sort_comparator
+     end
+     alias_method :sort_comparator=, :sort_comparator
+
+     # If you need to manipulate the Hadoop job in some way that isn't covered
+     # by this DSL, this is the method for you. It yields the `Job`, letting
+     # you do whatever you want with it.
+     #
+     # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/mapreduce/Job.html Hadoop's Job
+     #
+     # @yieldparam [Hadoop::Mapreduce::Job] job The raw Hadoop Job instance
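+     #
+     # @example Setting the number of reduce tasks (the standard Hadoop setter, via JRuby's snake_case conversion)
+     #
+     #     raw do |job|
+     #       job.set_num_reduce_tasks(10)
+     #     end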
+     def raw(&block)
+       yield @job
+     end
+
+     private
+
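+     # Defines a reader/writer pair that sets a Java class on the job. For
+     # example, `class_setter :output_key` defines roughly the following (a
+     # sketch of the generated methods):
+     #
+     #     def output_key(cls=nil)
+     #       if cls
+     #         @job.set_output_key_class(cls.java_class)
+     #         @output_key = cls
+     #       end
+     #       @output_key
+     #     end
+     #
+     #     def output_key=(cls)
+     #       @job.set_output_key_class(cls.java_class)
+     #     end
+     #
+     # Unlike `mapper` and friends these pass `cls.java_class` straight
+     # through, so they expect Java classes such as `Hadoop::Io::Text`.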
+     def self.class_setter(dsl_name)
+       define_method(dsl_name) do |cls|
+         if cls
+           @job.send("set_#{dsl_name}_class", cls.java_class)
+           instance_variable_set(:"@#{dsl_name}", cls)
+         end
+         instance_variable_get(:"@#{dsl_name}")
+       end
+       define_method("#{dsl_name}=") do |cls|
+         @job.send("set_#{dsl_name}_class", cls.java_class)
+       end
+     end
+
+     public
+
+     # @!method map_output_key(cls)
+     #
+     # Sets the mapper's output key type.
+     #
+     # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/mapreduce/Job.html#setMapOutputKeyClass(java.lang.Class) Hadoop's Job#setMapOutputKeyClass
+     #
+     # @param [Class] cls The mapper's output key type
+     class_setter :map_output_key
+
+     # @!method map_output_value(cls)
+     #
+     # Sets the mapper's output value type.
+     #
+     # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/mapreduce/Job.html#setMapOutputValueClass(java.lang.Class) Hadoop's Job#setMapOutputValueClass
+     #
+     # @param [Class] cls The mapper's output value type
+     class_setter :map_output_value
+
+     # @!method output_key(cls)
+     #
+     # Sets the reducer's output key type.
+     #
+     # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/mapreduce/Job.html#setOutputKeyClass(java.lang.Class) Hadoop's Job#setOutputKeyClass
+     #
+     # @param [Class] cls The reducer's output key type
+     class_setter :output_key
+
+     # @!method output_value(cls)
+     #
+     # Sets the reducer's output value type.
+     #
+     # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/mapreduce/Job.html#setOutputValueClass(java.lang.Class) Hadoop's Job#setOutputValueClass
+     #
+     # @param [Class] cls The reducer's output value type
+     class_setter :output_value
+   end
+
+   # @private
+   class Context
+     attr_reader :jobs, :arguments
+
+     def initialize(conf, proxy_classes, arguments)
+       @conf = conf
+       @proxy_classes = proxy_classes
+       @arguments = arguments
+       @jobs = []
+     end
+
+     def create_job(name)
+       hadoop_job = Hadoop::Mapreduce::Job.new(@conf, name)
+       @jobs << hadoop_job
+       hadoop_job
+     end
+
+     def proxy_class(type)
+       @proxy_classes[type]
+     end
+   end
+ end
data/lib/rubydoop/package.rb ADDED
@@ -0,0 +1,123 @@
+ # encoding: utf-8
+
+ require 'bundler'
+ require 'open-uri'
+ require 'ant'
+ require 'fileutils'
+ require 'set'
+
+
+ module Rubydoop
+   # Utility for making a job JAR that works with Hadoop.
+   #
+   # @example Easy to use from Rake
+   #   task :package do
+   #     Rubydoop::Package.create!
+   #   end
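+   #
+   # @example Passing options (values are illustrative)
+   #   task :package do
+   #     Rubydoop::Package.create!(
+   #       :project_name => 'word_count',
+   #       :gem_groups => [:default, :production],
+   #       :lib_jars => ['ext/joda-time-2.1.jar']
+   #     )
+   #   end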
+   class Package
+     # A package has sane defaults that work in most situations, but almost
+     # everything can be changed.
+     #
+     # If you have extra JAR files that you need to make available for your job
+     # you can specify them with the `:lib_jars` option.
+     #
+     # @param [Hash] options
+     # @option options [String] :project_base_dir The project's base dir, defaults to the current directory (the assumption is that Package will be used from a Rake task)
+     # @option options [String] :project_name The name of the JAR file (minus .jar), defaults to the directory name of the `:project_base_dir`
+     # @option options [String] :build_dir The directory to put the final JAR into, defaults to `:project_base_dir + '/build'`
+     # @option options [Array<String>] :gem_groups All gems from these Gemfile groups will be included, defaults to `[:default]` (the top-level group of a Gemfile)
+     # @option options [Array<String>] :lib_jars Paths to extra JAR files to include in the JAR's lib directory (where they will be on the classpath when the job is run)
+     # @option options [String] :jruby_version The JRuby version to package, defaults to `JRUBY_VERSION`
+     # @option options [String] :jruby_jar_path The path to a local copy of `jruby-complete.jar`, defaults to downloading and caching a version determined by `:jruby_version`
+     def initialize(options={})
+       @options = default_options.merge(options)
+       @options[:project_name] = File.basename(@options[:project_base_dir]) unless @options[:project_name]
+       @options[:build_dir] = File.join(@options[:project_base_dir], 'build') unless @options[:build_dir]
+       @options[:jruby_jar_path] = File.join(@options[:build_dir], "jruby-complete-#{@options[:jruby_version]}.jar") unless @options[:jruby_jar_path]
+     end
+
+     # Create the JAR package, see {Package#initialize} for configuration options.
+     #
+     # On the first run a complete JRuby runtime JAR will be downloaded
+     # (`jruby-complete.jar`) and locally cached, but if you already have a
+     # copy in a local Ivy or Maven repository that will be used instead.
+     def create!
+       create_directories!
+       fetch_jruby!
+       build_jar!
+     end
+
+     # A shortcut for `Package.new(options).create!`.
+     def self.create!(options={})
+       new(options).create!
+     end
+
+     private
+
+     def default_options
+       {
+         :main_class => 'rubydoop.RubydoopJobRunner',
+         :rubydoop_base_dir => File.expand_path('../../..', __FILE__),
+         :project_base_dir => Dir.getwd,
+         :gem_groups => [:default],
+         :lib_jars => [],
+         :jruby_version => JRUBY_VERSION
+       }
+     end
+
+     def create_directories!
+       FileUtils.mkdir_p(@options[:build_dir])
+     end
+
+     def fetch_jruby!
+       return if File.exists?(@options[:jruby_jar_path])
+
+       local_maven_path = File.expand_path("~/.m2/repository/org/jruby/jruby-complete/#{@options[:jruby_version]}/jruby-complete-#{@options[:jruby_version]}.jar")
+       local_ivy_path = File.expand_path("~/.ivy2/cache/org.jruby/jruby-complete/jars/jruby-complete-#{@options[:jruby_version]}.jar")
+       remote_maven_url = "http://central.maven.org/maven2/org/jruby/jruby-complete/#{@options[:jruby_version]}/jruby-complete-#{@options[:jruby_version]}.jar"
+
+       if File.exists?(local_maven_path)
+         $stderr.puts("Using #{File.basename(local_maven_path)} from local Maven cache")
+         @options[:jruby_jar_path] = local_maven_path
+       elsif File.exists?(local_ivy_path)
+         $stderr.puts("Using #{File.basename(local_ivy_path)} from local Ivy2 cache")
+         @options[:jruby_jar_path] = local_ivy_path
+       else
+         $stderr.puts("Downloading #{remote_maven_url} to #{@options[:jruby_jar_path]}")
+         jruby_complete_bytes = open(remote_maven_url).read
+         File.open(@options[:jruby_jar_path], 'wb') do |io|
+           io.write(jruby_complete_bytes)
+         end
+       end
+     end
+
+     def build_jar!
+       # the ant block is instance_exec'ed, so instance variables and methods are not in scope
+       options = @options
+       bundled_gems = load_path
+       lib_jars = [options[:jruby_jar_path], *options[:lib_jars]]
+       ant do
+         jar :destfile => "#{options[:build_dir]}/#{options[:project_name]}.jar" do
+           manifest { attribute :name => 'Main-Class', :value => options[:main_class] }
+           zipfileset :src => "#{options[:rubydoop_base_dir]}/lib/rubydoop.jar"
+           fileset :dir => "#{options[:rubydoop_base_dir]}/lib", :includes => '**/*.rb', :excludes => '*.jar'
+           fileset :dir => "#{options[:project_base_dir]}/lib"
+           bundled_gems.each { |path| fileset :dir => path }
+           lib_jars.each { |extra_jar| zipfileset :dir => File.dirname(extra_jar), :includes => File.basename(extra_jar), :prefix => 'lib' }
+         end
+       end
+     end
+
+     def load_path
+       Bundler.definition.specs_for(@options[:gem_groups]).flat_map do |spec|
+         if spec.full_name !~ /^(?:bundler|rubydoop)-\d+/
+           spec.require_paths.map do |rp|
+             "#{spec.full_gem_path}/#{rp}"
+           end
+         else
+           []
+         end
+       end
+     end
+   end
+ end
data/lib/rubydoop/version.rb ADDED
@@ -0,0 +1,4 @@
+ module Rubydoop
+   # @private
+   VERSION = '1.0.0'
+ end
metadata ADDED
@@ -0,0 +1,53 @@
+ --- !ruby/object:Gem::Specification
+ name: rubydoop
+ version: !ruby/object:Gem::Version
+   prerelease:
+   version: 1.0.0
+ platform: java
+ authors:
+ - Theo Hultberg
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2012-10-01 00:00:00.000000000Z
+ dependencies: []
+ description: Rubydoop embeds a JRuby runtime in Hadoop, letting you write map reduce code in Ruby without using the streaming APIs
+ email:
+ - theo@iconara.net
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - lib/hadoop.rb
+ - lib/rubydoop.rb
+ - lib/rubydoop/dsl.rb
+ - lib/rubydoop/package.rb
+ - lib/rubydoop/version.rb
+ - lib/rubydoop.jar
+ homepage: http://github.com/iconara/rubydoop
+ licenses: []
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+   none: false
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+   none: false
+ requirements: []
+ rubyforge_project: rubydoop
+ rubygems_version: 1.8.15
+ signing_key:
+ specification_version: 3
+ summary: Write Hadoop jobs in Ruby
+ test_files: []
+ has_rdoc:
+ ...