rubydoop 1.0.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/hadoop.rb +31 -0
- data/lib/rubydoop.jar +0 -0
- data/lib/rubydoop.rb +58 -0
- data/lib/rubydoop/dsl.rb +357 -0
- data/lib/rubydoop/package.rb +123 -0
- data/lib/rubydoop/version.rb +4 -0
- metadata +53 -0
data/lib/hadoop.rb
ADDED
@@ -0,0 +1,31 @@
+# encoding: utf-8
+
+require 'java'
+
+
+# @private
+module Hadoop
+  module Io
+    include_package 'org.apache.hadoop.io'
+  end
+
+  module Mapreduce
+    include_package 'org.apache.hadoop.mapreduce'
+
+    module Lib
+      include_package 'org.apache.hadoop.mapreduce.lib'
+
+      module Input
+        include_package 'org.apache.hadoop.mapreduce.lib.input'
+      end
+
+      module Output
+        include_package 'org.apache.hadoop.mapreduce.lib.output'
+      end
+    end
+  end
+
+  module Fs
+    include_package 'org.apache.hadoop.fs'
+  end
+end
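The nested modules above are thin JRuby `include_package` wrappers: each one exposes the classes of the corresponding Hadoop Java package as Ruby constants, resolved lazily on first reference. A minimal usage sketch, assuming this runs under JRuby with the Hadoop jars on the classpath:

    require 'hadoop'

    text = Hadoop::Io::Text.new('hello')        # org.apache.hadoop.io.Text
    one  = Hadoop::Io::IntWritable.new(1)       # org.apache.hadoop.io.IntWritable
    path = Hadoop::Fs::Path.new('/tmp/input')   # org.apache.hadoop.fs.Path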
data/lib/rubydoop.jar
ADDED
Binary file
data/lib/rubydoop.rb
ADDED
@@ -0,0 +1,58 @@
+# encoding: utf-8
+
+$LOAD_PATH << File.expand_path('..', __FILE__)
+
+
+require 'hadoop'
+
+
+# See {Rubydoop.configure} for the job configuration DSL documentation,
+# {Package} for the packaging documentation, or the {file:README.md README}
+# for a getting started guide.
+module Rubydoop
+  # @private
+  def self.create_mapper(conf)
+    create_instance(conf.get(MAPPER_KEY))
+  end
+
+  # @private
+  def self.create_reducer(conf)
+    create_instance(conf.get(REDUCER_KEY))
+  end
+
+  # @private
+  def self.create_combiner(conf)
+    create_instance(conf.get(COMBINER_KEY))
+  end
+
+  # @private
+  def self.create_partitioner(conf)
+    create_instance(conf.get(PARTITIONER_KEY))
+  end
+
+  # @private
+  def self.create_grouping_comparator(conf)
+    create_instance(conf.get(GROUPING_COMPARATOR_KEY))
+  end
+
+  # @private
+  def self.create_sort_comparator(conf)
+    create_instance(conf.get(SORT_COMPARATOR_KEY))
+  end
+
+  private
+
+  MAPPER_KEY = 'rubydoop.mapper'.freeze
+  REDUCER_KEY = 'rubydoop.reducer'.freeze
+  COMBINER_KEY = 'rubydoop.combiner'.freeze
+  PARTITIONER_KEY = 'rubydoop.partitioner'.freeze
+  GROUPING_COMPARATOR_KEY = 'rubydoop.grouping_comparator'.freeze
+  SORT_COMPARATOR_KEY = 'rubydoop.sort_comparator'.freeze
+
+  def self.create_instance(const_path)
+    cls = const_path.split('::').reduce(Object) { |host, name| host.const_get(name) }
+    cls.new
+  end
+end
+
+require 'rubydoop/dsl'
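The `create_*` factories are called from Rubydoop's Java proxy classes (in `rubydoop.jar`): each proxy reads the configured Ruby class name from the job configuration and instantiates it via `create_instance`. A standalone sketch of the constant lookup it performs, using a hypothetical `WordCount::Mapper`:

    module WordCount
      class Mapper; end
    end

    const_path = 'WordCount::Mapper'
    cls = const_path.split('::').reduce(Object) { |host, name| host.const_get(name) }
    cls.new  # => an instance of WordCount::Mapper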
data/lib/rubydoop/dsl.rb
ADDED
@@ -0,0 +1,357 @@
+# encoding: utf-8
+
+module Rubydoop
+  # Main entrypoint into the configuration DSL.
+  #
+  # @example Configuring a job
+  #
+  #   Rubydoop.configure do |*args|
+  #     job 'word_count' do
+  #       input args[0]
+  #       output args[1]
+  #
+  #       mapper WordCount::Mapper
+  #       reducer WordCount::Reducer
+  #
+  #       output_key Hadoop::Io::Text
+  #       output_value Hadoop::Io::IntWritable
+  #     end
+  #   end
+  #
+  # Within a configure block you can specify one or more jobs; the `job`
+  # blocks are run in the context of a {JobDefinition} instance, so look
+  # at that class for documentation about the available properties. The
+  # `configure` block is run within the context of a {ConfigurationDefinition}
+  # instance. The arguments to the `configure` block are the command line
+  # arguments, minus those handled by Hadoop's `ToolRunner`.
+  #
+  # @yieldparam [Array<String>] *arguments The command line arguments
+  #
+  # @note The tool runner will set the global variable `$rubydoop_context`
+  #   to an object that contains references to the necessary Hadoop
+  #   configuration. Unless this global variable is set the configuration
+  #   block is not run (this is a feature, it means that the configuration
+  #   block doesn't run in mappers and reducers).
+  #
+  def self.configure(impl=ConfigurationDefinition, &block)
+    impl.new($rubydoop_context, &block) if $rubydoop_context
+  end
+
+  # Lower level API for configuring jobs.
+  #
+  # @example Configuring a job
+  #
+  #   cc = ConfigurationDefinition.new
+  #   cc.job 'word_count' do
+  #     # same DSL as shown in the documentation for Rubydoop.configure
+  #   end
+  #
+  class ConfigurationDefinition
+    def initialize(context=$rubydoop_context, &block)
+      @context = context
+      instance_exec(*arguments, &block) if @context && block_given?
+    end
+
+    def arguments
+      @context.arguments
+    end
+
+    def job(name, &block)
+      return nil unless @context
+      job = JobDefinition.new(@context, @context.create_job(name))
+      job.instance_exec(&block)
+      job
+    end
+  end
+
+  # Job configuration DSL.
+  #
+  # `Rubydoop.configure` blocks are run within the context of an instance of
+  # this class. These are the methods available in those blocks.
+  #
+  class JobDefinition
+    # @private
+    def initialize(context, job)
+      @context = context
+      @job = job
+    end
+
+    # Sets the input paths of the job.
+    #
+    # Calls `setInputFormatClass` on the Hadoop job and uses the static
+    # `setInputPaths` on the input format to set the job's input paths.
+    #
+    # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/mapreduce/Job.html#setInputFormatClass(java.lang.Class) Hadoop's Job#setInputFormatClass
+    #
+    # @param [String, Enumerable] paths The input paths, either a comma separated
+    #   string or an `Enumerable` of strings (which will be joined with a comma).
+    # @param [Hash] options
+    # @option options [JavaClass] :format The input format to use, defaults to `TextInputFormat`
+    def input(paths, options={})
+      paths = paths.join(',') if paths.is_a?(Enumerable)
+      format = options[:format] || Hadoop::Mapreduce::Lib::Input::TextInputFormat
+      format.set_input_paths(@job, paths)
+      @job.set_input_format_class(format)
+    end
+
+    # Sets the output path of the job.
+    #
+    # Calls `setOutputFormatClass` on the Hadoop job and uses the static
+    # `setOutputPath` on the output format to set the job's output path.
+    #
+    # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/mapreduce/Job.html#setOutputFormatClass(java.lang.Class) Hadoop's Job#setOutputFormatClass
+    #
+    # @param [String] dir The output path
+    # @param [Hash] options
+    # @option options [JavaClass] :format The output format to use, defaults to `TextOutputFormat`
+    def output(dir, options={})
+      format = options[:format] || Hadoop::Mapreduce::Lib::Output::TextOutputFormat
+      format.set_output_path(@job, Hadoop::Fs::Path.new(dir))
+      @job.set_output_format_class(format)
+    end
+
+    # Sets a job property.
+    #
+    # Calls `set`/`setBoolean`/`setLong`/`setFloat` on the Hadoop Job's
+    # configuration (the exact method depends on the type of the value).
+    #
+    # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/conf/Configuration.html#set(java.lang.String,%20java.lang.String) Hadoop's Configuration#set
+    #
+    # @param [String] property The property name
+    # @param [String, Numeric, Boolean] value The property value
+    def set(property, value)
+      case value
+      when Integer
+        @job.configuration.set_long(property, value)
+      when Float
+        @job.configuration.set_float(property, value)
+      when true, false
+        @job.configuration.set_boolean(property, value)
+      else
+        @job.configuration.set(property, value)
+      end
+    end
+
+    # Sets the mapper class.
+    #
+    # The equivalent of calling `setMapperClass` on a Hadoop job, but instead
+    # of a Java class you pass a Ruby class and Rubydoop will wrap it in a way
+    # that works with Hadoop.
+    #
+    # The class only needs to implement the method `map`, which will be called
+    # exactly like a Java mapper class' `map` method would be called.
+    #
+    # You can optionally implement `setup` and `cleanup`, which mirror the
+    # methods of the same name in Java mappers.
+    #
+    # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/mapreduce/Mapper.html Hadoop's Mapper
+    # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/mapreduce/Job.html#setMapperClass(java.lang.Class) Hadoop's Job#setMapperClass
+    #
+    # @param [Class] cls The (Ruby) mapper class.
+    def mapper(cls=nil)
+      if cls
+        @job.configuration.set(MAPPER_KEY, cls.name)
+        @job.set_mapper_class(@context.proxy_class(:mapper))
+        @mapper = cls
+      end
+      @mapper
+    end
+    alias_method :mapper=, :mapper
+
+    # Sets the reducer class.
+    #
+    # The equivalent of calling `setReducerClass` on a Hadoop job, but instead
+    # of a Java class you pass a Ruby class and Rubydoop will wrap it in a way
+    # that works with Hadoop.
+    #
+    # The class only needs to implement the method `reduce`, which will be called
+    # exactly like a Java reducer class' `reduce` method would be called.
+    #
+    # You can optionally implement `setup` and `cleanup`, which mirror the
+    # methods of the same name in Java reducers.
+    #
+    # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/mapreduce/Reducer.html Hadoop's Reducer
+    # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/mapreduce/Job.html#setReducerClass(java.lang.Class) Hadoop's Job#setReducerClass
+    #
+    # @param [Class] cls The (Ruby) reducer class.
+    def reducer(cls=nil)
+      if cls
+        @job.configuration.set(REDUCER_KEY, cls.name)
+        @job.set_reducer_class(@context.proxy_class(:reducer))
+        @reducer = cls
+      end
+      @reducer
+    end
+    alias_method :reducer=, :reducer
+
+    # Sets the combiner class.
+    #
+    # The equivalent of calling `setCombinerClass` on a Hadoop job, but instead
+    # of a Java class you pass a Ruby class and Rubydoop will wrap it in a way
+    # that works with Hadoop.
+    #
+    # A combiner should implement `reduce`, just like reducers.
+    #
+    # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/mapreduce/Job.html#setCombinerClass(java.lang.Class) Hadoop's Job#setCombinerClass
+    #
+    # @param [Class] cls The (Ruby) combiner class.
+    def combiner(cls=nil)
+      if cls
+        @job.configuration.set(COMBINER_KEY, cls.name)
+        @job.set_combiner_class(@context.proxy_class(:combiner))
+        @combiner = cls
+      end
+      @combiner
+    end
+    alias_method :combiner=, :combiner
+
+    # Sets a custom partitioner.
+    #
+    # The equivalent of calling `setPartitionerClass` on a Hadoop job, but instead
+    # of a Java class you pass a Ruby class and Rubydoop will wrap it in a way
+    # that works with Hadoop.
+    #
+    # The class must implement `partition`, which will be called exactly like
+    # a Java partitioner would.
+    #
+    # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/mapreduce/Job.html#setPartitionerClass(java.lang.Class) Hadoop's Job#setPartitionerClass
+    #
+    # @param [Class] cls The (Ruby) partitioner class.
+    def partitioner(cls=nil)
+      if cls
+        @job.configuration.set(PARTITIONER_KEY, cls.name)
+        @job.set_partitioner_class(@context.proxy_class(:partitioner))
+        @partitioner = cls
+      end
+      @partitioner
+    end
+    alias_method :partitioner=, :partitioner
+
+    # Sets a custom grouping comparator.
+    #
+    # The equivalent of calling `setGroupingComparatorClass` on a Hadoop job,
+    # but instead of a Java class you pass a Ruby class and Rubydoop will wrap
+    # it in a way that works with Hadoop.
+    #
+    # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/mapreduce/Job.html#setGroupingComparatorClass(java.lang.Class) Hadoop's Job#setGroupingComparatorClass
+    #
+    # @param [Class] cls The (Ruby) comparator class.
+    def grouping_comparator(cls=nil)
+      if cls
+        @job.configuration.set(GROUPING_COMPARATOR_KEY, cls.name)
+        @job.set_grouping_comparator_class(@context.proxy_class(:grouping_comparator))
+        @grouping_comparator = cls
+      end
+      @grouping_comparator
+    end
+    alias_method :grouping_comparator=, :grouping_comparator
+
+    # Sets a custom sort comparator.
+    #
+    # The equivalent of calling `setSortComparatorClass` on a Hadoop job,
+    # but instead of a Java class you pass a Ruby class and Rubydoop will wrap
+    # it in a way that works with Hadoop.
+    #
+    # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/mapreduce/Job.html#setSortComparatorClass(java.lang.Class) Hadoop's Job#setSortComparatorClass
+    #
+    # @param [Class] cls The (Ruby) comparator class.
+    def sort_comparator(cls=nil)
+      if cls
+        @job.configuration.set(SORT_COMPARATOR_KEY, cls.name)
+        @job.set_sort_comparator_class(@context.proxy_class(:sort_comparator))
+        @sort_comparator = cls
+      end
+      @sort_comparator
+    end
+    alias_method :sort_comparator=, :sort_comparator
+
+    # If you need to manipulate the Hadoop job in some way that isn't covered by
+    # this DSL, this is the method for you. It yields the `Job`, letting you
+    # do whatever you want with it.
+    #
+    # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/mapreduce/Job.html Hadoop's Job
+    #
+    # @yieldparam [Hadoop::Mapreduce::Job] job The raw Hadoop Job instance
+    def raw(&block)
+      yield @job
+    end
+
+    private
+
+    def self.class_setter(dsl_name)
+      define_method(dsl_name) do |cls|
+        if cls
+          @job.send("set_#{dsl_name}_class", cls.java_class)
+          instance_variable_set(:"@#{dsl_name}", cls)
+        end
+        instance_variable_get(:"@#{dsl_name}")
+      end
+      define_method("#{dsl_name}=") do |cls|
+        @job.send("set_#{dsl_name}_class", cls.java_class)
+      end
+    end
+
+    public
+
+    # @!method map_output_key(cls)
+    #
+    # Sets the mapper's output key type.
+    #
+    # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/mapreduce/Job.html#setMapOutputKeyClass(java.lang.Class) Hadoop's Job#setMapOutputKeyClass
+    #
+    # @param [Class] cls The mapper's output key type
+    class_setter :map_output_key
+
+    # @!method map_output_value(cls)
+    #
+    # Sets the mapper's output value type.
+    #
+    # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/mapreduce/Job.html#setMapOutputValueClass(java.lang.Class) Hadoop's Job#setMapOutputValueClass
+    #
+    # @param [Class] cls The mapper's output value type
+    class_setter :map_output_value
+
+    # @!method output_key(cls)
+    #
+    # Sets the reducer's output key type.
+    #
+    # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/mapreduce/Job.html#setOutputKeyClass(java.lang.Class) Hadoop's Job#setOutputKeyClass
+    #
+    # @param [Class] cls The reducer's output key type
+    class_setter :output_key
+
+    # @!method output_value(cls)
+    #
+    # Sets the reducer's output value type.
+    #
+    # @see http://hadoop.apache.org/docs/r1.0.3/api/org/apache/hadoop/mapreduce/Job.html#setOutputValueClass(java.lang.Class) Hadoop's Job#setOutputValueClass
+    #
+    # @param [Class] cls The reducer's output value type
+    class_setter :output_value
+  end
+
+  # @private
+  class Context
+    attr_reader :jobs, :arguments
+
+    def initialize(conf, proxy_classes, arguments)
+      @conf = conf
+      @proxy_classes = proxy_classes
+      @arguments = arguments
+      @jobs = []
+    end
+
+    def create_job(name)
+      hadoop_job = Hadoop::Mapreduce::Job.new(@conf, name)
+      @jobs << hadoop_job
+      hadoop_job
+    end
+
+    def proxy_class(type)
+      @proxy_classes[type]
+    end
+  end
+end
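Taken together, the DSL supports a complete word-count job along these lines. This is a sketch: the `WordCount` classes are hypothetical, and the `map`/`reduce` signatures mirror the Hadoop `Mapper#map(key, value, context)` and `Reducer#reduce(key, values, context)` calls the doc comments describe:

    module WordCount
      class Mapper
        def map(key, value, context)
          value.to_s.split.each do |word|
            context.write(Hadoop::Io::Text.new(word), Hadoop::Io::IntWritable.new(1))
          end
        end
      end

      class Reducer
        def reduce(key, values, context)
          sum = 0
          values.each { |count| sum += count.get }
          context.write(key, Hadoop::Io::IntWritable.new(sum))
        end
      end
    end

    Rubydoop.configure do |input_path, output_path|
      job 'word_count' do
        input input_path
        output output_path

        mapper WordCount::Mapper
        reducer WordCount::Reducer

        map_output_key Hadoop::Io::Text
        map_output_value Hadoop::Io::IntWritable
        output_key Hadoop::Io::Text
        output_value Hadoop::Io::IntWritable
      end
    end

Because the block only runs when `$rubydoop_context` is set, the same file can safely be loaded inside mappers and reducers.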
data/lib/rubydoop/package.rb
ADDED
@@ -0,0 +1,123 @@
+# encoding: utf-8
+
+require 'bundler'
+require 'open-uri'
+require 'ant'
+require 'fileutils'
+require 'set'
+
+
+module Rubydoop
+  # Utility for making a job JAR that works with Hadoop.
+  #
+  # @example Easy to use from Rake
+  #
+  #   task :package do
+  #     Rubydoop::Package.create!
+  #   end
+  class Package
+    # A package has sane defaults that work in most situations, but almost
+    # everything can be changed.
+    #
+    # If you have extra JAR files that you need to make available for your job
+    # you can specify them with the `:lib_jars` option.
+    #
+    # @param [Hash] options
+    # @option options [String] :project_base_dir The project's base dir, defaults to the current directory (the assumption is that Package will be used from a Rake task)
+    # @option options [String] :project_name The name of the JAR file (minus .jar), defaults to the directory name of the `:project_base_dir`
+    # @option options [String] :build_dir The directory to put the final JAR into, defaults to `:project_base_dir + '/build'`
+    # @option options [Array<String>] :gem_groups All gems from these Gemfile groups will be included, defaults to `[:default]` (the top-level group of a Gemfile)
+    # @option options [Array<String>] :lib_jars Paths to extra JAR files to include in the JAR's lib directory (where they will be on the classpath when the job is run)
+    # @option options [String] :jruby_version The JRuby version to package, defaults to `JRUBY_VERSION`
+    # @option options [String] :jruby_jar_path The path to a local copy of `jruby-complete.jar`, defaults to downloading and caching a version defined by `:jruby_version`
+    def initialize(options={})
+      @options = default_options.merge(options)
+      @options[:project_name] = File.basename(@options[:project_base_dir]) unless @options[:project_name]
+      @options[:build_dir] = File.join(@options[:project_base_dir], 'build') unless @options[:build_dir]
+      @options[:jruby_jar_path] = File.join(@options[:build_dir], "jruby-complete-#{@options[:jruby_version]}.jar") unless @options[:jruby_jar_path]
+    end
+
+    # Create the JAR package, see {Package#initialize} for configuration options.
+    #
+    # On the first run a complete JRuby runtime JAR will be downloaded
+    # (`jruby-complete.jar`) and locally cached, but if you already have a
+    # copy in a local Ivy or Maven repository that will be used instead.
+    def create!
+      create_directories!
+      fetch_jruby!
+      build_jar!
+    end
+
+    # A shortcut for `Package.new(options).create!`.
+    def self.create!(options={})
+      new(options).create!
+    end
+
+    private
+
+    def default_options
+      defaults = {
+        :main_class => 'rubydoop.RubydoopJobRunner',
+        :rubydoop_base_dir => File.expand_path('../../..', __FILE__),
+        :project_base_dir => Dir.getwd,
+        :gem_groups => [:default],
+        :lib_jars => [],
+        :jruby_version => JRUBY_VERSION
+      }
+    end
+
+    def create_directories!
+      FileUtils.mkdir_p(@options[:build_dir])
+    end
+
+    def fetch_jruby!
+      return if File.exists?(@options[:jruby_jar_path])
+
+      local_maven_path = File.expand_path("~/.m2/repository/org/jruby/jruby-complete/#{@options[:jruby_version]}/jruby-complete-#{@options[:jruby_version]}.jar")
+      local_ivy_path = File.expand_path("~/.ivy2/cache/org.jruby/jruby-complete/jars/jruby-complete-#{@options[:jruby_version]}.jar")
+      remote_maven_url = "http://central.maven.org/maven2/org/jruby/jruby-complete/#{@options[:jruby_version]}/jruby-complete-#{@options[:jruby_version]}.jar"
+
+      if File.exists?(local_maven_path)
+        $stderr.puts("Using #{File.basename(local_maven_path)} from local Maven cache")
+        @options[:jruby_jar_path] = local_maven_path
+      elsif File.exists?(local_ivy_path)
+        $stderr.puts("Using #{File.basename(local_ivy_path)} from local Ivy2 cache")
+        @options[:jruby_jar_path] = local_ivy_path
+      else
+        $stderr.puts("Downloading #{remote_maven_url} to #{@options[:jruby_jar_path]}")
+        jruby_complete_bytes = open(remote_maven_url).read
+        File.open(@options[:jruby_jar_path], 'wb') do |io|
+          io.write(jruby_complete_bytes)
+        end
+      end
+    end
+
+    def build_jar!
+      # the ant block is instance_exec'ed so instance variables and methods are not in scope
+      options = @options
+      bundled_gems = load_path
+      lib_jars = [options[:jruby_jar_path], *options[:lib_jars]]
+      ant do
+        jar :destfile => "#{options[:build_dir]}/#{options[:project_name]}.jar" do
+          manifest { attribute :name => 'Main-Class', :value => options[:main_class] }
+          zipfileset :src => "#{options[:rubydoop_base_dir]}/lib/rubydoop.jar"
+          fileset :dir => "#{options[:rubydoop_base_dir]}/lib", :includes => '**/*.rb', :excludes => '*.jar'
+          fileset :dir => "#{options[:project_base_dir]}/lib"
+          bundled_gems.each { |path| fileset :dir => path }
+          lib_jars.each { |extra_jar| zipfileset :dir => File.dirname(extra_jar), :includes => File.basename(extra_jar), :prefix => 'lib' }
+        end
+      end
+    end
+
+    def load_path
+      Bundler.definition.specs_for(@options[:gem_groups]).flat_map do |spec|
+        if spec.full_name !~ /^(?:bundler|rubydoop)-\d+/
+          spec.require_paths.map do |rp|
+            "#{spec.full_gem_path}/#{rp}"
+          end
+        else
+          []
+        end
+      end
+    end
+  end
+end
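As the class comment suggests, packaging is usually wired into Rake. A minimal sketch, assuming the default project layout (Ruby code under `lib/`, gems in the Gemfile's top-level group); the `ext/extra.jar` path is hypothetical:

    # Rakefile (run under JRuby)
    require 'rubydoop/package'

    task :package do
      Rubydoop::Package.create!
    end

    # With options, e.g. bundling an extra JAR on the job's classpath:
    task :package_with_jars do
      Rubydoop::Package.create!(:project_name => 'word_count', :lib_jars => ['ext/extra.jar'])
    end

The resulting `build/<project_name>.jar` bundles `jruby-complete`, the project's code and gems, and Rubydoop's runner, so it can be submitted to a cluster with a plain `hadoop jar`.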
metadata
ADDED
@@ -0,0 +1,53 @@
+--- !ruby/object:Gem::Specification
+name: rubydoop
+version: !ruby/object:Gem::Version
+  prerelease:
+  version: 1.0.0
+platform: java
+authors:
+- Theo Hultberg
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2012-10-01 00:00:00.000000000Z
+dependencies: []
+description: Rubydoop embeds a JRuby runtime in Hadoop, letting you write map reduce code in Ruby without using the streaming APIs
+email:
+- theo@iconara.net
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/hadoop.rb
+- lib/rubydoop.rb
+- lib/rubydoop/dsl.rb
+- lib/rubydoop/package.rb
+- lib/rubydoop/version.rb
+- lib/rubydoop.jar
+homepage: http://github.com/iconara/rubydoop
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+  none: false
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+  none: false
+requirements: []
+rubyforge_project: rubydoop
+rubygems_version: 1.8.15
+signing_key:
+specification_version: 3
+summary: Write Hadoop jobs in Ruby
+test_files: []
+has_rdoc:
+...