RubyGems - humboldt - Versions diffs - 1.0.5-java → 1.1.0-java - Mend

humboldt 1.0.5-java → 1.1.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

checksums.yaml +4 -4
data/lib/ext/hadoop.rb +7 -0
data/lib/ext/rubydoop.rb +72 -6
data/lib/humboldt/cli.rb +3 -0
data/lib/humboldt/emr_flow.rb +1 -0
data/lib/humboldt/patterns/sum_reducer.rb +1 -0
data/lib/humboldt/prefix_grouping.rb +4 -0
data/lib/humboldt/version.rb +1 -1
data/lib/humboldt.jar +0 -0
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 0dd6faa77ad31ace7326eeee615ceba4a3a34508
-  data.tar.gz: 157bc24062834dbe6d2e0f343d1213c414446ef2
+  metadata.gz: 1b68b1b122d8e69538a1ed888cd8ede41e07a4a7
+  data.tar.gz: b658d3c93c062116ecec9c1b1f3b09fb26b5ff49
 SHA512:
-  metadata.gz: 54d0fcda75d41fbba31402449efa494031202ca76fb7f45ea19494effade4c081c326062e505e9967910320940dba3ecf7b9a15a9d0e917ebf37b31c1234803a
-  data.tar.gz: f0b075d53128a28cee17cf565e44dd2c110a8f2284ea801fe1e93a7f441dd670c924ce881dfa8bada2c0960bf0fe9846d85d07a2dcd5e762fbff009ef275231e
+  metadata.gz: 17780975cfc99cb6f4f4639d23dc3c3336085c5343e37b37841eb7d8eacfc49d7963d5f217d90f9495f4ed8aed1ac36e45419e79d7dddc1cce93aa87b08621c6
+  data.tar.gz: 8b79f3d82f416d5180ff657ca01de80b6370b745a6587aec44efcdfb4918196d0d0b61c84f46ca3fbb4ca0d10e249c5cbc1cf64e241aaa7b3c47771ef76dd43c

data/lib/ext/hadoop.rb CHANGED Viewed

@@ -7,4 +7,11 @@ module Hadoop
   module Conf
     include_package 'org.apache.hadoop.conf'
   end
+  module Mapreduce
+    module Lib
+      module Partition
+        include_package 'org.apache.hadoop.mapreduce.lib.partition'
+      end
+    end
+  end
 end

data/lib/ext/rubydoop.rb CHANGED Viewed

@@ -33,16 +33,29 @@ module Rubydoop
     def enable_compression!
       unless local_mode?
-        set 'mapred.compress.map.output', true
-        set 'mapred.output.compress', true
-        set 'mapred.map.output.compression.codec', 'org.apache.hadoop.io.compress.GzipCodec'
-        set 'mapred.output.compression.codec', 'org.apache.hadoop.io.compress.GzipCodec'
-        set 'mapred.output.compression.type', 'BLOCK'
+        if framework == :mapreduce
+          set 'mapreduce.map.output.compress', true
+          set 'mapreduce.output.fileoutputformat.compress', true
+          set 'mapreduce.map.output.compress.codec', 'org.apache.hadoop.io.compress.GzipCodec'
+          set 'mapreduce.output.fileoutputformat.compress.codec', 'org.apache.hadoop.io.compress.GzipCodec'
+          set 'mapreduce.output.fileoutputformat.compress.type', 'BLOCK'
+        else
+          set 'mapred.compress.map.output', true
+          set 'mapred.output.compress', true
+          set 'mapred.map.output.compression.codec', 'org.apache.hadoop.io.compress.GzipCodec'
+          set 'mapred.output.compression.codec', 'org.apache.hadoop.io.compress.GzipCodec'
+          set 'mapred.output.compression.type', 'BLOCK'
+        end
       end
     end
+    def framework
+      @framework ||= @job.configuration.get('mapreduce.framework.name') ? :mapreduce : :mapred
+    end
     def local_mode?
-      @job.configuration.get('mapred.job.tracker') == 'local'
+      property = framework == :mapreduce ? 'mapreduce.framework.name' : 'mapred.job.tracker'
+      @job.configuration.get(property) == 'local'
     end
     def cache_file(file, options = {})
@@ -56,5 +69,58 @@ module Rubydoop
         Hadoop::FileCache::DistributedCache.add_cache_file(uri, @job.configuration)
       end
     end
+    # Configures the job for secondary sort on the specified slice of the mapper
+    # output key.
+    #
+    # Hadoop comes with a partitioner that can partition the map output based
+    # on a slice of the map output key. Humboldt ships with a comparator that
+    # uses the same configuration. Together they can be used to implement
+    # secondary sort.
+    #
+    # Secondary sort is a mapreduce pattern where you emit a key, but partition
+    # and group only on a subset of that key. This has the result that each
+    # reduce invocation will see values grouped by the subset, but ordered by
+    # the whole key. It is used, among other things, to efficiently count
+    # distinct values.
+    #
+    # Say you want to count the number of distinct visitors to a site. Your
+    # input is pairs of site and visitor IDs. The naïve implementation is to
+    # emit the site as key and the visitor ID as value and then, in the reducer,
+    # collect all IDs in a set, and emit the site and the size of the set of IDs.
+    # This is very memory inefficient, and impractical. For any interesting
+    # amount of data you will not be able to keep all the visitor IDs in memory.
+    #
+    # What you do, instead, is to concatenate the site and visitor ID and emit
+    # that as key, and the visitor ID as value. It might seem wasteful to emit
+    # the visitor ID twice, but it's necessary since Hadoop will only give you
+    # the key for the first value in each group.
+    #
+    # You then instruct Hadoop to partition and group on just the site part of
+    # the key. Hadoop will still sort the values by their full key, so within
+    # each group the values will be sorted by visitor ID. In the reducer it's
+    # now trivial to loop over the values and just increment a counter each time
+    # the visitor ID changes.
+    #
+    # You configure which part of the key to partition and group by specifying
+    # the start and end _indexes_. The reason why they are indexes and not a
+    # start index and a length, like Ruby's `String#slice`, is that you also can
+    # use negative indexes to count from the end. Negative indexes are useful
+    # for example when you don't know how wide the part of the key that you want
+    # to use is. In the example above, if you use the domain to identify sites, these
+    # can be of different lengths. If your visitor IDs are 20 characters you can
+    # use 0 and -20 as your indexes.
+    #
+    # @param [Fixnum] start_index The first index of the slice, negative numbers
+    #   are counted from the end
+    # @param [Fixnum] end_index The last index of the slice, negative numbers
+    #   are counted from the end
+    # @see http://hadoop.apache.org/docs/r2.7.1/api/org/apache/hadoop/mapreduce/lib/partition/BinaryPartitioner.html Hadoop's BinaryPartitioner
+    def secondary_sort(start_index, end_index)
+      @job.set_partitioner_class(Hadoop::Mapreduce::Lib::Partition::BinaryPartitioner)
+      Hadoop::Mapreduce::Lib::Partition::BinaryPartitioner.set_offsets(@job.configuration, start_index, end_index)
+      @job.set_grouping_comparator_class(Humboldt::JavaLib::BinaryComparator)
+      Humboldt::JavaLib::BinaryComparator.set_offsets(@job.configuration, start_index, end_index)
+    end
   end
 end

data/lib/humboldt/cli.rb CHANGED Viewed

@@ -63,6 +63,7 @@ module Humboldt
       run_command('hadoop', 'jar', project_jar, '-conf', hadoop_config_path, job_config, input_glob, output_path, *options[:extra_hadoop_args])
     end
+    # @deprecated EMR support will be removed in 2.0
     desc 'run-emr', 'run a job in Elastic MapReduce'
     method_option :input, :type => :string, :required => true, :desc => 'input glob, will be resolved against the data bucket'
     method_option :output, :type => :string, :desc => 'the output directory, defaults to "<project_name>/<job_config>/output" in the job bucket'
@@ -107,6 +108,7 @@ module Humboldt
       say_status(:started, %{EMR job flow "#{job_flow.job_flow_id}"})
     end
+    # @deprecated EMR support will be removed in 2.0
     desc 'emr-job', 'show status of the last EMR job'
     def emr_job
       if File.exists?('.humboldtjob')
@@ -118,6 +120,7 @@ module Humboldt
       end
     end
+    # @deprecated EMR support will be removed in 2.0
     desc 'emr-jobs', 'list all EMR jobs'
     def emr_jobs
       emr.job_flows.each do |job_flow|

data/lib/humboldt/emr_flow.rb CHANGED Viewed

@@ -1,6 +1,7 @@
 # encoding: utf-8
 module Humboldt
+  # @deprecated EMR support will be removed in 2.0
   class EmrFlow
     attr_reader :output_path

data/lib/humboldt/patterns/sum_reducer.rb CHANGED Viewed

@@ -2,6 +2,7 @@
 module Humboldt
   module Patterns
+    # @deprecated Use Hadoop's LongSumReducer
     class SumReducer < Reducer
       input :text, :long
       output :text, :long

data/lib/humboldt/prefix_grouping.rb CHANGED Viewed

@@ -4,6 +4,7 @@ require 'zlib'
 module Humboldt
+  # @deprecated Use {Rubydoop::JobDescription::secondary_sort}
   class BinaryPrefixPartitioner
     def initialize(cutoff_index)
       @cutoff_index = cutoff_index
@@ -16,6 +17,7 @@ module Humboldt
     end
   end
+  # @deprecated Use {Rubydoop::JobDescription::secondary_sort}
   class DropBinaryPrefixPartitioner < BinaryPrefixPartitioner
     def partition(key, value, num_partitions)
       length = key.length > @cutoff_index ? key.length - @cutoff_index : 0
@@ -24,6 +26,7 @@ module Humboldt
     end
   end
+  # @deprecated Use {Rubydoop::JobDescription::secondary_sort}
   class BinaryPrefixComparator
     def initialize(cutoff_index)
       @cutoff_index = cutoff_index
@@ -36,6 +39,7 @@ module Humboldt
     end
   end
+  # @deprecated Use {Rubydoop::JobDescription::secondary_sort}
   class DropBinaryPrefixComparator < BinaryPrefixComparator
     def compare_raw(bytes1, start1, length1, bytes2, start2, length2)
       subset_length1 = length1 - @cutoff_index

data/lib/humboldt/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # encoding: utf-8
 module Humboldt
-  VERSION = '1.0.5'.freeze
+  VERSION = '1.1.0'.freeze
 end

data/lib/humboldt.jar CHANGED Viewed

Binary file

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: humboldt
 version: !ruby/object:Gem::Version
-  version: 1.0.5
+  version: 1.1.0
 platform: java
 authors:
 - The Burt Platform Team
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-09-03 00:00:00.000000000 Z
+date: 2015-10-01 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement