humboldt 1.0.0-java
- checksums.yaml +7 -0
- data/bin/humboldt +8 -0
- data/config/emr-bootstrap/remove_old_jruby.sh +11 -0
- data/config/hadoop-local.xml +12 -0
- data/lib/ext/hadoop.rb +10 -0
- data/lib/ext/rubydoop.rb +60 -0
- data/lib/humboldt/cli.rb +263 -0
- data/lib/humboldt/emr_flow.rb +198 -0
- data/lib/humboldt/hadoop_status_filter.rb +97 -0
- data/lib/humboldt/java_lib.rb +5 -0
- data/lib/humboldt/mapper.rb +15 -0
- data/lib/humboldt/patterns/sum_reducer.rb +16 -0
- data/lib/humboldt/prefix_grouping.rb +46 -0
- data/lib/humboldt/processor.rb +96 -0
- data/lib/humboldt/reducer.rb +34 -0
- data/lib/humboldt/rspec.rb +100 -0
- data/lib/humboldt/type_converters.rb +180 -0
- data/lib/humboldt/version.rb +5 -0
- data/lib/humboldt.jar +0 -0
- data/lib/humboldt.rb +16 -0
- metadata +112 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA1:
  metadata.gz: 7ca8c0825572a27fc4673c042f4d5677e250340a
  data.tar.gz: 910d598f4df79e42eda1bbf4d022db9a160c48c3
SHA512:
  metadata.gz: 6cf04eb473b93684dfde74f56ed6113d9c8d15fbfcc7686b33d878570a804ee5c29c11e0fdc29692b689f9b5039e8f0a21a93781d669c2ae4d4dc8384f7d436f
  data.tar.gz: 84d6fab2395c97ebda5eac18684371ee475ee336823a4cb936f11196b8905911436af418bfb3386649c6c3c2097095c38a1aab2ada827f56e33f8020c50e6f38
data/bin/humboldt
ADDED
data/config/emr-bootstrap/remove_old_jruby.sh
ADDED
@@ -0,0 +1,11 @@
#!/bin/bash

if [ -e /home/hadoop/lib/jruby-complete-no-joda-1.6.5.jar ]
then
  rm /home/hadoop/lib/jruby-complete-no-joda-1.6.5.jar
fi

if [ -e /home/hadoop/lib/jruby-complete-1.6.8.jar ]
then
  rm /home/hadoop/lib/jruby-complete-1.6.8.jar
fi
data/lib/ext/hadoop.rb
ADDED
data/lib/ext/rubydoop.rb
ADDED
@@ -0,0 +1,60 @@
# encoding: utf-8

module Rubydoop
  class JobDefinition
    alias mapperrr mapper
    def mapper(cls)
      map_output_key cls.output_key.const_get(:HADOOP) if cls.respond_to?(:output_key)
      map_output_value cls.output_value.const_get(:HADOOP) if cls.respond_to?(:output_value)
      mapperrr cls
    end

    alias reducerrr reducer
    def reducer(cls)
      output_key cls.output_key.const_get(:HADOOP) if cls.respond_to?(:output_key)
      output_value cls.output_value.const_get(:HADOOP) if cls.respond_to?(:output_value)
      reducerrr cls
    end

    alias inputtt input
    def input(paths, options={})
      options = options.dup
      format = options[:format]
      STDERR.puts "Warning! Using `format: :combined_text` will not work with remote input paths (e.g. S3) and Hadoop 1.x. Cf. https://issues.apache.org/jira/browse/MAPREDUCE-1806" if format == :combined_text
      unless format.nil? or format.is_a?(Class)
        class_name = format.to_s.gsub(/^.|_./) {|x| x[-1,1].upcase } + "InputFormat"
        begin
          options[:format] = Humboldt::JavaLib.const_get(class_name)
        rescue NameError
        end
      end
      inputtt(paths, options)
    end

    def enable_compression!
      unless local_mode?
        set 'mapred.compress.map.output', true
        set 'mapred.output.compress', true
        set 'mapred.map.output.compression.codec', 'org.apache.hadoop.io.compress.GzipCodec'
        set 'mapred.output.compression.codec', 'org.apache.hadoop.io.compress.GzipCodec'
        set 'mapred.output.compression.type', 'BLOCK'
      end
    end

    def local_mode?
      @job.configuration.get('mapred.job.tracker') == 'local'
    end

    def cache_file(file, options = {})
      symlink = options.fetch(:as, File.basename(file))
      if local_mode? && !Hadoop::Mapreduce::Job.instance_methods.include?(:add_cache_file)
        unless File.symlink?(symlink) && File.readlink(symlink) == file
          FileUtils.ln_s file, symlink
        end
      else
        uri = java.net.URI.new("#{file}\##{symlink}")
        Hadoop::FileCache::DistributedCache.add_cache_file(uri, @job.configuration)
      end
    end
  end
end
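These extensions hook into Rubydoop's job DSL so that mappers and reducers declare their own key/value types and input formats can be given as symbols. A minimal sketch of a job configuration file that relies on them follows; the job name, paths and WordCount classes are hypothetical, only the DSL calls (input, output, mapper, reducer, enable_compression!, cache_file) come from the listing above.

# Hypothetical job configuration (e.g. lib/word_count.rb) using the extended DSL.
require 'humboldt'

Rubydoop.configure do |input_path, output_path|
  job 'word_count' do
    # :combined_text is resolved to Humboldt::JavaLib::CombinedTextInputFormat when it exists
    input input_path, format: :combined_text
    output output_path

    # output key/value types are picked up from the processor classes
    mapper WordCount::Mapper
    reducer WordCount::Reducer

    enable_compression!
    cache_file 'data/stopwords.txt', as: 'stopwords.txt'
  end
end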
data/lib/humboldt/cli.rb
ADDED
@@ -0,0 +1,263 @@
# encoding: utf-8

require 'thor'
require 'aws'
require 'open3'
require 'rubydoop/package' # this prints an annoying warning in JRuby 1.7.0.RC1
require 'humboldt/emr_flow'
require 'humboldt/hadoop_status_filter'


module Humboldt
  class Cli < Thor
    include Thor::Actions

    DEFAULTS = {
      data_path: 'data/completes',
      silent: true,
      skip_package: false,
      extra_hadoop_args: [],
      cleanup_before: false,
      instance_count: 4,
      instance_type: 'c1.xlarge',
      spot_instances: nil,
      bid_price: 0.2,
      poll: false,
      skip_prepare: false,
      aws_region: 'eu-west-1',
      hadoop_version: '1.0.3'
    }

    desc 'package', 'Package job JAR file'
    def package
      say_status(:package, relative_path(job_package.jar_path))
      job_package.create!
    end

    desc 'run-local', 'run a job in local mode with the hadoop command'
    method_option :input, :type => :string, :required => true, :desc => 'input glob, will be resolved against the data path'
    method_option :output, :type => :string, :desc => 'the output directory, defaults to "data/<job_config>/output"'
    method_option :job_config, :type => 'string', :desc => 'the name of the Ruby file containing the job configuration, defaults to the project name (e.g. "lib/<job_config>.rb")'
    method_option :hadoop_config, :type => 'string', :desc => 'the path to a Hadoop configuration XML file, defaults to Humboldt-provided config that runs Hadoop in local-mode'
    method_option :cleanup_before, :type => :boolean, :desc => "automatically remove the output dir before launching (default: #{DEFAULTS[:cleanup_before]})"
    method_option :data_path, :type => :string, :desc => "input paths will be resolved against this path (default: #{DEFAULTS[:data_path]})"
    method_option :silent, :type => :boolean, :desc => "silence the hadoop command's logging (default: #{DEFAULTS[:silent]})"
    method_option :skip_package, :type => :boolean, :desc => "don't package the JAR, use only if you haven't changed anything since the last run (default: #{DEFAULTS[:skip_package]})"
    method_option :extra_hadoop_args, :type => :array, :desc => "extra arguments to pass on to hadoop (default: #{DEFAULTS[:extra_hadoop_args]})"
    def run_local
      check_job!
      invoke(:package, [], {}) unless options.skip_package?
      output_path = options[:output] || "data/#{job_config}/output"
      output_path_parent = File.dirname(output_path)
      if options.cleanup_before?
        remove_file(output_path)
      else
        check_local_output!(output_path)
      end
      unless File.exists?(output_path_parent)
        empty_directory(output_path_parent)
      end
      input_glob = File.join(options[:data_path], options[:input])
      hadoop_config_path = options[:hadoop_config] || default_hadoop_config_path
      run_command('hadoop', 'jar', project_jar, '-conf', hadoop_config_path, job_config, input_glob, output_path, *options[:extra_hadoop_args])
    end

    desc 'run-emr', 'run a job in Elastic MapReduce'
    method_option :input, :type => :string, :required => true, :desc => 'input glob, will be resolved against the data bucket'
    method_option :output, :type => :string, :desc => 'the output directory, defaults to "<project_name>/<job_config>/output" in the job bucket'
    method_option :job_config, :type => 'string', :desc => 'the name of the Ruby file containing the job configuration, defaults to the project name (e.g. "lib/<job_config>.rb")'
    method_option :cleanup_before, :type => :boolean, :desc => "automatically remove the output dir before launching (default: #{DEFAULTS[:cleanup_before]})"
    method_option :data_bucket, :type => :string, :desc => "S3 bucket containing input data (default: #{DEFAULTS[:data_bucket]})"
    method_option :job_bucket, :type => :string, :desc => "S3 bucket to upload JAR, output logs and results into (default: #{DEFAULTS[:job_bucket]})"
    method_option :instance_count, :type => :numeric, :desc => "the number of worker instances to launch (default: #{DEFAULTS[:instance_count]})"
    method_option :instance_type, :type => :string, :desc => "the worker instance type, see http://ec2pricing.iconara.info/ for available types (default: #{DEFAULTS[:instance_type]})"
    method_option :spot_instances, :type => :array, :lazy_default => [], :desc => 'use spot instances; either an explicit list of instance groups or no value to run all groups as spot instances'
    method_option :bid_price, :type => :string, :desc => "how much to bid for spot instances, see http://ec2pricing.iconara.info/ for current spot prices (default: #{DEFAULTS[:bid_price]})"
    method_option :poll, :type => :boolean, :desc => "poll the job's status every 10s and display (default: #{DEFAULTS[:poll]})"
    method_option :skip_package, :type => :boolean, :desc => "don't package the JAR, use only if you haven't changed anything since the last run (default: #{DEFAULTS[:skip_package]})"
    method_option :skip_prepare, :type => :boolean, :desc => "don't upload the JAR and bootstrap files, use only if you haven't changed anything since the last run (default: #{DEFAULTS[:skip_prepare]})"
    method_option :extra_hadoop_args, :type => :array, :desc => "extra arguments to pass on to hadoop (default: #{DEFAULTS[:extra_hadoop_args]})"
    method_option :ec2_key_name, :type => :string, :desc => 'The name of an EC2 key pair to enable SSH access to master node'
    method_option :aws_region, :type => :string, :desc => "The AWS region where the EMR flow is to run (default: #{DEFAULTS[:aws_region]})"
    method_option :hadoop_version, :type => :string, :desc => "The EMR Hadoop version to use (default: #{DEFAULTS[:hadoop_version]})"
    def run_emr
      check_job!
      invoke(:package, [], {}) unless options.skip_package?
      flow = EmrFlow.new(job_config, options[:input], job_package, emr, data_bucket, job_bucket, options[:output])
      if options.cleanup_before?
        say_status(:remove, flow.output_uri)
        flow.cleanup!
      end
      unless options.skip_prepare?
        say_status(:upload, flow.jar_uri)
        flow.prepare!
      end
      say_status(:warning, "No EC2 key name configured. You will not be able to access the master node via SSH.", :yellow) unless options[:ec2_key_name]
      job_flow = flow.run!(
        bid_price: options[:bid_price],
        instance_count: options[:instance_count],
        instance_type: options[:instance_type],
        spot_instances: options[:spot_instances],
        extra_hadoop_args: options[:extra_hadoop_args],
        ec2_key_name: options[:ec2_key_name],
        hadoop_version: options[:hadoop_version]
      )
      File.open('.humboldtjob', 'w') { |io| io.puts(job_flow.job_flow_id) }
      say_status(:started, %{EMR job flow "#{job_flow.job_flow_id}"})
    end

    desc 'emr-job', 'show status of the last EMR job'
    def emr_job
      if File.exists?('.humboldtjob')
        job_flow_id = File.read('.humboldtjob').strip
        job_flow = emr.job_flows[job_flow_id]
        print_job_flow_extended_status(job_flow)
      else
        say_status(:warning, 'Could not determine last job flow ID')
      end
    end

    desc 'emr-jobs', 'list all EMR jobs'
    def emr_jobs
      emr.job_flows.each do |job_flow|
        print_job_flow_status(job_flow)
      end
    end

    desc 'configure', 'Configure humboldt for the current project'
    def configure
      say("Please ensure you are located at the root directory of the project you are configuring.", :yellow)
      configuration = options_from_config_file
      say('EMR configuration', :green)
      configuration[:ec2_key_name] = ask("EC2 key pair name to enable SSH access to EMR master node: [#{config_file_options_with_defaults[:ec2_key_name]}]")
      configuration[:aws_region] = ask("AWS region: [#{config_file_options_with_defaults[:aws_region]}]")
      configuration[:hadoop_version] = ask("Hadoop version: [#{config_file_options_with_defaults[:hadoop_version]}]")
      configuration[:data_bucket] = ask("Input data S3 bucket: [#{config_file_options_with_defaults[:data_bucket]}]")
      configuration[:job_bucket] = ask("Job S3 bucket (where JAR is uploaded, output logs and job output go to): [#{config_file_options_with_defaults[:job_bucket]}]")
      configuration.each do |key, value|
        value = configuration[key] = config_file_options_with_defaults[key] if value.empty?
        configuration.delete(key) if value.empty? || value == DEFAULTS[key]
      end
      File.open('.humboldt.yml', 'w') { |f| YAML.dump(configuration, f) }
      say('Updated .humboldt.yml', :green)
    end

    no_commands do
      def options
        @extended_options ||= Thor::CoreExt::HashWithIndifferentAccess.new(config_file_options_with_defaults.merge(super))
      end
    end

    private

    ISO_DATE_TIME = '%Y-%m-%d %H:%M:%S'.freeze

    def project_jar
      @project_jar ||= Dir['build/*.jar'].reject { |path| path.start_with?('build/jruby-complete') }.first
    end

    def job_package
      @job_package ||= Rubydoop::Package.new(lib_jars: Dir[File.expand_path('../../**/*.jar', __FILE__)])
    end

    def job_config
      options[:job_config] || job_package.project_name
    end

    def default_hadoop_config_path
      File.expand_path('../../../config/hadoop-local.xml', __FILE__)
    end

    def s3
      @s3 ||= AWS::S3.new
    end

    def emr
      @emr ||= AWS::EMR.new(region: options[:aws_region])
    end

    def job_bucket
      @job_bucket ||= s3.buckets[options[:job_bucket]]
    end

    def data_bucket
      @data_bucket ||= s3.buckets[options[:data_bucket]]
    end

    def check_job!
      raise Thor::Error, "No such job: #{job_config}" unless File.exists?("lib/#{job_config}.rb")
    end

    def relative_path(path)
      path.sub(Dir.pwd + '/', '')
    end

    def check_local_output!(path)
      if File.exists?(path)
        raise Thor::Error, "#{options[:output]} already exists!"
      end
    end

    def run_command(*args)
      say_status(:running, 'Hadoop started')
      Open3.popen3(*args) do |stdin, stdout, stderr, wait_thr|
        stdin.close
        stdout_printer = Thread.new(stdout) do |stdout|
          while line = stdout.gets
            say(line.chomp)
          end
        end
        stderr_printer = Thread.new(stderr) do |stderr|
          filter = HadoopStatusFilter.new(stderr, self, options.silent?)
          filter.run
        end
        stdout_printer.join
        stderr_printer.join
        if wait_thr.value.exitstatus == 0
          say_status(:done, 'Job completed')
        else
          say_status(:failed, 'Job failed', :red)
        end
      end
    end

    def print_job_flow_extended_status(job_flow)
      id = job_flow.job_flow_id
      state = job_flow.state
      created_at = job_flow.created_at.strftime(ISO_DATE_TIME)
      change_reason = job_flow.last_state_change_reason
      say_status(:started, created_at)
      say_status(:state, state)
      say_status(:change, change_reason)
    rescue => e
      say_status(:error, e.message, :red)
      sleep 1
      retry
    end

    def print_job_flow_status(job_flow)
      id = job_flow.job_flow_id
      state = job_flow.state
      created_at = job_flow.created_at.strftime(ISO_DATE_TIME)
      change_reason = job_flow.last_state_change_reason
      say_status(:status, sprintf('%-15s %-10s %19s %s', id, state, created_at, change_reason))
    rescue => e
      say_status(:error, e.message, :red)
      sleep 1
      retry
    end

    def config_file_options_with_defaults
      @config_file_options_with_defaults ||= DEFAULTS.merge(options_from_config_file)
    end

    def options_from_config_file
      @options_from_config_file ||= begin
        ::YAML::load_file(".humboldt.yml")
      rescue Errno::ENOENT
        {}
      end
    end
  end
end
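These commands are normally invoked through the humboldt executable, but since Humboldt::Cli is a plain Thor class it can also be driven from Ruby via Thor's standard start method. A hedged sketch; the input glob is made up for illustration:

# Hypothetical programmatic invocation, roughly equivalent to running
# `humboldt run_local --input 'events/2014-06-*' --cleanup-before` from a project root.
require 'humboldt/cli'

Humboldt::Cli.start(%w[run_local --input events/2014-06-* --cleanup-before])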
data/lib/humboldt/emr_flow.rb
ADDED
@@ -0,0 +1,198 @@
# encoding: utf-8

module Humboldt
  class EmrFlow
    attr_reader :output_path

    def initialize(*args)
      @job_name, @input_glob, @package, @emr, @data_bucket, @job_bucket, @output_path = args
      @output_path ||= "#{@package.project_name}/#{@job_name}/output"
    end

    def prepare!
      upload_bootstrap_task_files!
      upload_jar!
    end

    def cleanup!
      delete_output_dir!
    end

    def run!(launch_options={})
      check_jar!
      check_output_dir!
      create_flow!(launch_options)
    end

    def jar_path
      "#{@package.project_name}/#{File.basename(@package.jar_path)}"
    end

    def jar_uri
      s3_uri(jar_path)
    end

    def output_uri
      s3_uri(output_path)
    end

    def log_path
      "#{@package.project_name}/#{@job_name}/logs"
    end

    private

    BOOTSTRAP_TASK_FILES = {
      :remove_old_jruby => 'config/emr-bootstrap/remove_old_jruby.sh'
    }.freeze

    def s3_uri(path, options={})
      protocol = options[:protocol] || 's3'
      bucket = options[:bucket] || @job_bucket
      "#{protocol}://#{bucket.name}/#{path}"
    end

    def upload_bootstrap_task_files!
      BOOTSTRAP_TASK_FILES.values.each do |local_path|
        remote_obj = @job_bucket.objects["#{@package.project_name}/#{local_path}"]
        remote_obj.write(Pathname.new(File.expand_path(local_path, "#{__FILE__}/../../..")))
      end
    end

    def upload_jar!
      # TODO: upload only if not exists and MD5 != ETag
      jar_obj = @job_bucket.objects[jar_path]
      jar_obj.write(Pathname.new(@package.jar_path))
    end

    def check_jar!
      unless @job_bucket.objects.with_prefix(jar_path).any?
        raise "Job JAR missing (#{s3_uri(jar_path)})"
      end
    end

    def check_output_dir!
      if @job_bucket.objects.with_prefix(output_path).any?
        raise "Output directory already exists (#{s3_uri(output_path)})"
      end
    end

    def delete_output_dir!
      @job_bucket.objects.with_prefix(output_path).delete_all
    end

    def job_flow_configuration(launch_options)
      {
        :log_uri => s3_uri(log_path),
        :instances => instance_configuration(launch_options),
        :steps => [step_configuration(launch_options)],
        :bootstrap_actions => bootstrap_actions,
        :visible_to_all_users => true
      }
    end

    def instance_configuration(launch_options)
      {
        :ec2_key_name => launch_options[:ec2_key_name],
        :hadoop_version => launch_options[:hadoop_version],
        :instance_groups => InstanceGroupConfiguration.create(launch_options)
      }
    end

    def bootstrap_actions
      remove_old_jruby_action = {
        :name => 'remove_old_jruby',
        :script_bootstrap_action => {
          :path => s3_uri("#{@package.project_name}/#{BOOTSTRAP_TASK_FILES[:remove_old_jruby]}")
        }
      }

      # http://hadoop.apache.org/docs/r1.0.3/mapred-default.html
      configure_hadoop_action = {
        :name => 'configure_hadoop',
        :script_bootstrap_action => {
          :path => 's3://eu-west-1.elasticmapreduce/bootstrap-actions/configure-hadoop',
          :args => [
            '-m', 'mapred.job.reuse.jvm.num.tasks=-1',
            '-m', 'mapred.map.tasks.speculative.execution=false',
            '-m', 'mapred.reduce.tasks.speculative.execution=false'
          ]
        }
      }

      [remove_old_jruby_action, configure_hadoop_action]
    end

    def step_configuration(launch_options)
      {
        :name => @package.project_name,
        :hadoop_jar_step => {
          :jar => s3_uri(jar_path),
          :args => [
            @job_name,
            s3_uri(@input_glob, protocol: 's3n', bucket: @data_bucket),
            s3_uri(output_path, protocol: 's3n'),
            *launch_options[:extra_hadoop_args]
          ]
        }
      }
    end

    def create_flow!(launch_options)
      job_flow = @emr.job_flows.create(@package.project_name, job_flow_configuration(launch_options))
    end

    module InstanceGroupConfiguration
      extend self

      # TODO: add 'task' group when support is added for 'tasks'
      INSTANCE_GROUPS = %w[master core].freeze
      MASTER_INSTANCE_TYPE = 'm1.small'.freeze
      DEFAULT_CORE_INSTANCE_TYPE = 'c1.xlarge'.freeze
      DEFAULT_BID_PRICE = '0.2'.freeze
      DEFAULT_CORE_INSTANCE_COUNT = 4

      INSTANCE_TYPE_MAPPINGS = {
        'master' => MASTER_INSTANCE_TYPE,
        'core' => DEFAULT_CORE_INSTANCE_TYPE
      }.freeze

      INSTANCE_COUNT_MAPPINGS = {
        'master' => 1,
        'core' => DEFAULT_CORE_INSTANCE_COUNT
      }.freeze

      def base_configuration(group)
        {:name => "#{group.capitalize} Group", :instance_role => group.upcase}
      end

      def configure_type_and_count(group, configuration, options = {})
        if group == 'core'
          configuration[:instance_type] = options[:instance_type]
          configuration[:instance_count] = options[:instance_count]
        end

        configuration[:instance_type] ||= INSTANCE_TYPE_MAPPINGS[group]
        configuration[:instance_count] ||= INSTANCE_COUNT_MAPPINGS[group]
      end

      def configure_market(group, configuration, spot_instances, bid_price)
        if spot_instances && (spot_instances.empty? || spot_instances.include?(group))
          configuration[:market] = 'SPOT'
          configuration[:bid_price] = bid_price || DEFAULT_BID_PRICE
        else
          configuration[:market] = 'ON_DEMAND'
        end
      end

      def create(options)
        INSTANCE_GROUPS.map do |group|
          configuration = base_configuration(group)
          configure_type_and_count(group, configuration, options)
          configure_market(group, configuration, options[:spot_instances], options[:bid_price])
          configuration
        end
      end
    end
  end
end
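For reference, this mirrors how the run-emr command wires an EmrFlow together; a sketch with hypothetical bucket names and a hypothetical job, assuming the aws-sdk v1 API the gem depends on:

# Hypothetical standalone use of EmrFlow, mirroring Humboldt::Cli#run_emr.
require 'aws'
require 'rubydoop/package'
require 'humboldt/emr_flow'

s3  = AWS::S3.new
emr = AWS::EMR.new(region: 'eu-west-1')
package = Rubydoop::Package.new   # run from the project root

flow = Humboldt::EmrFlow.new('word_count', 'events/2014-06-*', package, emr,
                             s3.buckets['example-data-bucket'],
                             s3.buckets['example-job-bucket'])
flow.prepare!   # uploads the job JAR and the bootstrap script
job_flow = flow.run!(instance_count: 4, instance_type: 'c1.xlarge', hadoop_version: '1.0.3')
puts job_flow.job_flow_id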
data/lib/humboldt/hadoop_status_filter.rb
ADDED
@@ -0,0 +1,97 @@
# encoding: utf-8

module Humboldt
  class HadoopStatusFilter
    def initialize(hadoop_stderr, shell, silent)
      @hadoop_stderr = hadoop_stderr
      @shell = shell
      @silent = silent
      @counters = {}
    end

    def run
      counter_group = nil
      while line = @hadoop_stderr.gets
        if @counters_printing && (hadoop_log?(line) || line =~ /^\t+/)
          case line.chomp
          when /(?:JobClient: |\t+)([^\t]+)=(\d+)$/
            if counter_group
              @counters[counter_group] ||= {}
              @counters[counter_group][$1.strip] = $2.to_i
            end
          when /(?:JobClient: |\t+)([^\t]+)$/
            counter_group = $1.strip
          end
        elsif @error_printing && !hadoop_log?(line) && !ignore?(line)
          report_error(line)
        elsif ignore?(line)
          # do nothing
        else
          @counters_printing = false
          @error_printing = false
          case line
          when /map (\d+)% reduce (\d+)%/
            report_progress($1, $2)
          when /Counters: \d+/
            @counters_printing = true
          else
            unless hadoop_log?(line)
              @error_printing = true
              if line =~ /warning(!|:)/i
                @error_type = :warning
              else
                @error_type = :error
              end
              report_error(line)
            end
          end
        end
        @shell.say(line.chomp, :red) unless @silent
      end
      print_counters_table
    end

    private

    def hadoop_log?(line)
      line =~ /(?:INFO|WARN) (?:mapred|input|output|util|jvm|mapreduce)\./
    end

    def ignore?(line)
      case line
      when /^\s*$/,
           /Warning: \$HADOOP_HOME is deprecated/,
           /Unable to load realm info from SCDynamicStore/,
           /Unable to load native-hadoop library/,
           /Snappy native library not loaded/,
           /Configuration.deprecation:/,
           /WARN conf.Configuration.*attempt to override final parameter.*ignoring/i
        true
      else
        false
      end
    end

    def report_progress(map, reduce)
      @shell.say_status(:progress, "map #{map}%, reduce #{reduce}%")
    end

    def report_error(line)
      @shell.say_status(@error_type, line.chomp, @error_type == :error ? :red : :yellow)
    end

    def print_counters_table
      table = @counters.flat_map do |group, counters|
        [
          [group, *counters.first],
          *counters.drop(1).map { |counter, value| ['', counter, value] },
          ['', '', '']
        ]
      end
      table.pop
      @shell.say
      @shell.print_table(table)
      @shell.say
    end
  end
end
data/lib/humboldt/mapper.rb
ADDED
@@ -0,0 +1,15 @@
# encoding: utf-8

module Humboldt
  class Mapper < Processor
    class << self
      def map(&block)
        define_method(:map) do |key, value, context|
          @in_key.hadoop = key
          @in_value.hadoop = value
          instance_exec(@in_key.ruby, @in_value.ruby, &block)
        end
      end
    end
  end
end
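A mapper is defined by subclassing Humboldt::Mapper, declaring input and output types (resolved through the type converters listed later) and giving the map body as a block; emit comes from Processor. A minimal, hypothetical example:

# Hypothetical word-count mapper built on the class above.
class WordCount
  class Mapper < Humboldt::Mapper
    input :long, :text      # e.g. TextInputFormat byte offsets and lines
    output :text, :long

    map do |_offset, line|
      line.split(/\W+/).each do |word|
        emit(word.downcase, 1) unless word.empty?
      end
    end
  end
end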
data/lib/humboldt/prefix_grouping.rb
ADDED
@@ -0,0 +1,46 @@
# encoding: utf-8

require 'zlib'


module Humboldt
  class BinaryPrefixPartitioner
    def initialize(cutoff_index)
      @cutoff_index = cutoff_index
    end

    def partition(key, value, num_partitions)
      length = @cutoff_index > key.length ? key.length : @cutoff_index
      prefix = String.from_java_bytes(key.bytes)[0, length]
      Zlib.crc32(prefix) % num_partitions
    end
  end

  class DropBinaryPrefixPartitioner < BinaryPrefixPartitioner
    def partition(key, value, num_partitions)
      length = key.length > @cutoff_index ? key.length - @cutoff_index : 0
      prefix = String.from_java_bytes(key.bytes)[0, length]
      Zlib.crc32(prefix) % num_partitions
    end
  end

  class BinaryPrefixComparator
    def initialize(cutoff_index)
      @cutoff_index = cutoff_index
    end

    def compare_raw(bytes1, start1, length1, bytes2, start2, length2)
      subset_length1 = @cutoff_index > length1 ? length1 : @cutoff_index
      subset_length2 = @cutoff_index > length2 ? length2 : @cutoff_index
      ::Hadoop::Io::WritableComparator.compareBytes(bytes1, start1, subset_length1, bytes2, start2, subset_length2)
    end
  end

  class DropBinaryPrefixComparator < BinaryPrefixComparator
    def compare_raw(bytes1, start1, length1, bytes2, start2, length2)
      subset_length1 = length1 - @cutoff_index
      subset_length2 = length2 - @cutoff_index
      ::Hadoop::Io::WritableComparator.compareBytes(bytes1, start1, subset_length1, bytes2, start2, subset_length2)
    end
  end
end
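These partitioners and comparators group keys by a fixed-length binary prefix (or by everything after it), which is the usual building block for secondary sort. A small, hypothetical illustration of what the partitioning does, assuming a Hadoop BytesWritable key as the classes expect:

# Keys sharing the same 8-byte prefix always land in the same partition,
# regardless of what follows the prefix. Key contents are made up.
partitioner = Humboldt::BinaryPrefixPartitioner.new(8)

key = ::Hadoop::Io::BytesWritable.new
key.set('user-001:2014-06-03'.to_java_bytes, 0, 19)

partition = partitioner.partition(key, nil, 16)   # => an integer in 0...16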
data/lib/humboldt/processor.rb
ADDED
@@ -0,0 +1,96 @@
# encoding: utf-8

module Humboldt
  class Processor
    class << self
      def self.type_accessor(*names)
        names.each do |name|
          module_eval <<-EOA
            def #{name}
              @#{name} || superclass.#{name}
            end
            def #{name}=(type)
              @#{name} = TypeConverter[type]
              define_method(:#{name}_accessor) do
                TypeConverter[type].new
              end
            end
          EOA
        end
      end

      type_accessor :input_key, :input_value, :output_key, :output_value

      def input(*types)
        self.input_key = types.first
        self.input_value = types.last
      end

      def output(*types)
        self.output_key = types.first
        self.output_value = types.last
      end

      def setup(&block)
        define_method(:instance_setup, &block)
        private(:instance_setup)
      end

      def cleanup(&block)
        define_method(:instance_cleanup, &block)
        private(:instance_cleanup)
      end
    end

    attr_reader :current_context

    def setup(context)
      @current_context = context
      @in_key = input_key_accessor
      @in_value = input_value_accessor
      @out_key = output_key_accessor
      @out_value = output_value_accessor
      unless Hadoop::Mapreduce::Job.instance_methods.include?(:add_cache_file)
        create_symlinks!
      end
      instance_setup
    end

    def cleanup(context)
      instance_cleanup
    end

    protected

    def emit(key, value)
      @out_key.ruby = key
      @out_value.ruby = value
      @current_context.write(@out_key.hadoop, @out_value.hadoop)
    end

    private

    def instance_setup
    end

    def instance_cleanup
    end

    def create_symlinks!
      distributed_cache = ::Hadoop::FileCache::DistributedCache
      files = distributed_cache.get_cache_files(@current_context.configuration)
      local_files = distributed_cache.get_local_cache_files(@current_context.configuration)
      if files && local_files
        work_dir = ENV['HADOOP_WORK_DIR']
        files.each_with_index do |file, i|
          target = local_files[i].to_s
          link_path = File.join(work_dir, file.fragment)
          FileUtils.mkdir_p(File.dirname(link_path))
          unless File.exists?(link_path)
            FileUtils.ln_s(target, link_path)
          end
        end
      end
    end
  end
end
data/lib/humboldt/reducer.rb
ADDED
@@ -0,0 +1,34 @@
# encoding: utf-8

module Humboldt
  class Reducer < Processor
    class << self
      def reduce(&block)
        define_method(:reduce) do |key, values, context|
          @in_key.hadoop = key
          values_enumerator = TypeConversionEnumerator.new(@in_value, values.iterator)
          instance_exec(@in_key.ruby, values_enumerator, &block)
        end
      end
    end

    class TypeConversionEnumerator < Enumerator
      def initialize(*args)
        @value_converter, @hadoop_iterator = args
      end

      def each
        while @hadoop_iterator.has_next
          @value_converter.hadoop = @hadoop_iterator.next
          yield @value_converter.ruby
        end
      end

      def next
        raise StopIteration unless @hadoop_iterator.has_next
        @value_converter.hadoop = @hadoop_iterator.next
        @value_converter.ruby
      end
    end
  end
end
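Reducers follow the same pattern; the values arrive through the TypeConversionEnumerator above, so the block sees plain Ruby objects. A hypothetical counterpart to the mapper sketched earlier:

# Hypothetical word-count reducer; counts is an enumerator of Ruby integers.
class WordCount
  class Reducer < Humboldt::Reducer
    input :text, :long
    output :text, :long

    reduce do |word, counts|
      emit(word, counts.reduce(0) { |sum, n| sum + n })
    end
  end
end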
data/lib/humboldt/rspec.rb
ADDED
@@ -0,0 +1,100 @@
# encoding: utf-8

require 'humboldt'


module RunnerHelpers
  def run_nokey_mapper(mapper, *values, &context_callback)
    key = mapper.input_key_accessor.ruby
    args = values.map { |value| [key, value] }
    run_mapper(mapper, *args, &context_callback)
  end

  def run_mapper(mapper, *entries, &context_callback)
    in_value = mapper.input_value_accessor
    run(mapper, :map, context_callback, *entries) do |value|
      in_value.ruby = value
      in_value.hadoop
    end
  end

  def run_reducer(reducer, *entries, &context_callback)
    run(reducer, :reduce, context_callback, *entries) do |value|
      fake_iterator(*value.map do |v|
        in_value = reducer.input_value_accessor
        in_value.ruby = v
        in_value.hadoop
      end)
    end
  end

  def run(runner, method, context_callback, *entries)
    in_key = runner.input_key_accessor
    context = FakeContext.new(runner.output_key_accessor, runner.output_value_accessor)
    context_callback.call(context) if context_callback
    runner.setup(context)
    entries.each do |entry|
      in_key.ruby = entry.first
      runner.send(method, in_key.hadoop, yield(entry.last), context)
    end
    runner.cleanup(context)
    context.results
  end

  def fake_iterator(*values)
    FakeIterable.new(values)
  end

  class FakeIterable
    def initialize(values)
      @values = values
    end
    def iterator
      FakeIterator.new(@values.dup)
    end
  end

  class FakeIterator
    def initialize(values)
      @values = values
    end
    def has_next
      !@values.empty?
    end
    def next
      @values.shift
    end
  end

  class FakeContext
    attr_reader :results, :counters

    def initialize(key_accessor, value_accessor)
      @key_accessor, @value_accessor = key_accessor, value_accessor
      @results = []
      @counters = Hash.new { |h,k| h[k] = Hash.new { |h2,k2| h2[k2] = 0 } }
    end

    def write(key, value)
      @key_accessor.hadoop = key
      @value_accessor.hadoop = value
      @results << [@key_accessor.ruby, @value_accessor.ruby]
    end

    def configuration
      @configuration ||= ::Hadoop::Conf::Configuration.new.tap do |config|
        config.set 'mapred.job.tracker', 'local'
      end
    end

    def get_counter(group, name)
      FakeCounter.new do |amount|
        @counters[group][name] += amount
      end
    end
  end
end

RSpec.configure do |conf|
  conf.include(RunnerHelpers)
end
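With humboldt/rspec required, mappers and reducers can be exercised without a running Hadoop. A hedged example against the hypothetical WordCount classes used above:

# spec/word_count_spec.rb (hypothetical)
require 'humboldt/rspec'
require 'word_count'

describe WordCount::Reducer do
  it 'sums the counts for each word' do
    results = run_reducer(described_class.new, ['hadoop', [1, 2, 3]])
    expect(results).to eq([['hadoop', 6]])
  end
end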
data/lib/humboldt/type_converters.rb
ADDED
@@ -0,0 +1,180 @@
# encoding: utf-8

module Humboldt
  module TypeConverter
    class Binary
      HADOOP = ::Hadoop::Io::BytesWritable
      RUBY = ::String

      attr_reader :hadoop

      def hadoop=(value)
        unless value.is_a?(HADOOP)
          raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected #{HADOOP}"
        end
        @hadoop = value
      end

      def initialize
        @hadoop = HADOOP.new
      end

      def ruby
        String.from_java_bytes(@hadoop.bytes).byteslice(0, @hadoop.length)
      end

      def ruby=(value)
        unless value.is_a?(RUBY)
          raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected #{RUBY}"
        end

        @hadoop.set(value.to_java_bytes, 0, value.bytesize)
      end
    end

    begin
      require 'msgpack'

      class Encoded < Binary
        def ruby=(value)
          unless value.is_a?(Hash) || value.is_a?(Array)
            raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected Hash or Array"
          end
          packed = MessagePack.pack(value)
          @hadoop.set(packed.to_java_bytes, 0, packed.bytesize)
        end

        def ruby
          packed = String.from_java_bytes(@hadoop.bytes).byteslice(0, @hadoop.length)
          MessagePack.unpack(packed, encoding: Encoding::UTF_8)
        end
      end
    rescue LoadError
    end

    class Text
      HADOOP = ::Hadoop::Io::Text
      RUBY = ::String

      attr_reader :hadoop

      def hadoop=(value)
        unless value.is_a?(HADOOP)
          raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected #{HADOOP}"
        end
        @hadoop = value
      end

      def initialize
        @hadoop = HADOOP.new
      end

      def ruby
        String.from_java_bytes(@hadoop.bytes).byteslice(0, @hadoop.length).force_encoding(Encoding::UTF_8)
      end

      def ruby=(value)
        unless value.is_a?(RUBY)
          raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected #{RUBY}"
        end

        if value.encoding == Encoding::UTF_8
          @hadoop.set(value.to_java_bytes, 0, value.bytesize)
        else
          @hadoop.set(value)
        end
      end
    end

    begin
      require 'json'

      class Json < Text
        def ruby=(value)
          unless value.is_a?(Hash) || value.is_a?(Array)
            raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected Hash or Array"
          end
          @hadoop.set(JSON.generate(value))
        end

        def ruby
          JSON.parse(hadoop.to_s)
        end
      end
    end

    class Long
      HADOOP = ::Hadoop::Io::LongWritable
      RUBY = ::Integer

      attr_reader :hadoop

      def hadoop=(value)
        unless value.is_a?(HADOOP)
          raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected #{HADOOP}"
        end
        @hadoop = value
      end

      def initialize
        @hadoop = HADOOP.new
      end

      def ruby
        @hadoop.get
      end

      def ruby=(value)
        unless value.is_a?(Integer)
          raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected #{RUBY}"
        end

        @hadoop.set value
      end
    end

    class None
      HADOOP = ::Hadoop::Io::NullWritable
      RUBY = ::NilClass

      def hadoop
        HADOOP.get
      end

      def hadoop=(value)
        unless value.is_a?(HADOOP)
          raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected #{HADOOP}"
        end
      end

      def ruby
        nil
      end

      def ruby=(value)
        unless value.nil?
          raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected #{RUBY}"
        end
      end
    end

    TYPE_CONVERTER_CLASS_CACHE = Hash.new { |h,k| h[k] = const_get(k.to_s.capitalize) }

    def self.[](name)
      TYPE_CONVERTER_CLASS_CACHE[name]
    end

    FROM_HADOOP_MAPPINGS = {
      ::Hadoop::Io::Text => Text,
      ::Hadoop::Io::BytesWritable => Binary,
      ::Hadoop::Io::LongWritable => Long,
      ::Hadoop::Io::NullWritable => None
    }.freeze

    def self.from_hadoop(hadoop_class)
      accessor = FROM_HADOOP_MAPPINGS[hadoop_class]
      raise ArgumentError, "Unsupported Hadoop type: #{hadoop_class}" unless accessor
      accessor
    end
  end
end
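Each converter wraps one Hadoop writable and exposes it as a Ruby value; TypeConverter[:text], [:long], [:binary], [:json] and so on resolve through the class cache at the bottom of the file. A short round-trip sketch; the string value is made up:

# Round-tripping a Ruby string through the Text converter.
converter = Humboldt::TypeConverter[:text].new
converter.ruby = 'skål'      # stored in a Hadoop::Io::Text writable
converter.hadoop             # => the underlying Text instance
converter.ruby               # => "skål", forced to UTF-8

Humboldt::TypeConverter.from_hadoop(::Hadoop::Io::LongWritable)  # => Humboldt::TypeConverter::Long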
data/lib/humboldt.jar
ADDED
Binary file
data/lib/humboldt.rb
ADDED
@@ -0,0 +1,16 @@
# encoding: utf-8

require 'fileutils'
require 'rubydoop'
require 'hadoop'

require 'humboldt/java_lib'

require 'ext/hadoop'
require 'ext/rubydoop'

require 'humboldt/type_converters'
require 'humboldt/processor'
require 'humboldt/mapper'
require 'humboldt/reducer'
require 'humboldt/prefix_grouping'
metadata
ADDED
@@ -0,0 +1,112 @@
--- !ruby/object:Gem::Specification
name: humboldt
version: !ruby/object:Gem::Version
  version: 1.0.0
platform: java
authors:
- The Burt Platform Team
autorequire:
bindir: bin
cert_chain: []
date: 2014-06-03 00:00:00.000000000 Z
dependencies:
- !ruby/object:Gem::Dependency
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
  name: thor
  prerelease: false
  type: :runtime
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - ~>
      - !ruby/object:Gem::Version
        version: 1.1.2
  name: rubydoop
  prerelease: false
  type: :runtime
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ~>
      - !ruby/object:Gem::Version
        version: 1.1.2
- !ruby/object:Gem::Dependency
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: 1.16.0
    - - <
      - !ruby/object:Gem::Version
        version: 1.33.0
  name: aws-sdk
  prerelease: false
  type: :runtime
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: 1.16.0
    - - <
      - !ruby/object:Gem::Version
        version: 1.33.0
description: Humboldt provides a mapreduce API abstraction built on top of Rubydoop, and tools to run Hadoop jobs effortlessly both locally and on Amazon EMR
email:
- theo@burtcorp.com
executables:
- humboldt
extensions: []
extra_rdoc_files: []
files:
- bin/humboldt
- config/emr-bootstrap/remove_old_jruby.sh
- config/hadoop-local.xml
- lib/ext/hadoop.rb
- lib/ext/rubydoop.rb
- lib/humboldt.jar
- lib/humboldt.rb
- lib/humboldt/cli.rb
- lib/humboldt/emr_flow.rb
- lib/humboldt/hadoop_status_filter.rb
- lib/humboldt/java_lib.rb
- lib/humboldt/mapper.rb
- lib/humboldt/patterns/sum_reducer.rb
- lib/humboldt/prefix_grouping.rb
- lib/humboldt/processor.rb
- lib/humboldt/reducer.rb
- lib/humboldt/rspec.rb
- lib/humboldt/type_converters.rb
- lib/humboldt/version.rb
homepage: http://github.com/burtcorp/humboldt
licenses:
- BSD-3-Clause
metadata: {}
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - '>='
    - !ruby/object:Gem::Version
      version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - '>='
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubyforge_project:
rubygems_version: 2.2.2
signing_key:
specification_version: 4
summary: Tools and libraries for simplifying running Rubydoop jobs locally and on AWS Elastic MapReduce
test_files: []