humboldt 1.0.5-java → 1.1.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 0dd6faa77ad31ace7326eeee615ceba4a3a34508
-  data.tar.gz: 157bc24062834dbe6d2e0f343d1213c414446ef2
+  metadata.gz: 1b68b1b122d8e69538a1ed888cd8ede41e07a4a7
+  data.tar.gz: b658d3c93c062116ecec9c1b1f3b09fb26b5ff49
 SHA512:
-  metadata.gz: 54d0fcda75d41fbba31402449efa494031202ca76fb7f45ea19494effade4c081c326062e505e9967910320940dba3ecf7b9a15a9d0e917ebf37b31c1234803a
-  data.tar.gz: f0b075d53128a28cee17cf565e44dd2c110a8f2284ea801fe1e93a7f441dd670c924ce881dfa8bada2c0960bf0fe9846d85d07a2dcd5e762fbff009ef275231e
+  metadata.gz: 17780975cfc99cb6f4f4639d23dc3c3336085c5343e37b37841eb7d8eacfc49d7963d5f217d90f9495f4ed8aed1ac36e45419e79d7dddc1cce93aa87b08621c6
+  data.tar.gz: 8b79f3d82f416d5180ff657ca01de80b6370b745a6587aec44efcdfb4918196d0d0b61c84f46ca3fbb4ca0d10e249c5cbc1cf64e241aaa7b3c47771ef76dd43c
data/lib/ext/hadoop.rb CHANGED
@@ -7,4 +7,11 @@ module Hadoop
   module Conf
     include_package 'org.apache.hadoop.conf'
   end
+  module Mapreduce
+    module Lib
+      module Partition
+        include_package 'org.apache.hadoop.mapreduce.lib.partition'
+      end
+    end
+  end
 end
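
For context, the new nesting follows JRuby's include_package convention: once the modules are defined, constant lookups inside them resolve lazily to classes in the named Java package. A minimal sketch of what this enables (assuming a JRuby session with the Hadoop jars on the classpath; the require path is an assumption):

    require 'java'
    require 'ext/hadoop'  # the file diffed above

    # The constant lookup triggers JRuby's Java integration and returns the
    # proxy class for org.apache.hadoop.mapreduce.lib.partition.BinaryPartitioner.
    partitioner = Hadoop::Mapreduce::Lib::Partition::BinaryPartitioner

This is what lets the new secondary_sort method (in the next file) reference the partitioner as a plain Ruby constant.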
data/lib/ext/rubydoop.rb CHANGED
@@ -33,16 +33,29 @@ module Rubydoop
 
     def enable_compression!
       unless local_mode?
-        set 'mapred.compress.map.output', true
-        set 'mapred.output.compress', true
-        set 'mapred.map.output.compression.codec', 'org.apache.hadoop.io.compress.GzipCodec'
-        set 'mapred.output.compression.codec', 'org.apache.hadoop.io.compress.GzipCodec'
-        set 'mapred.output.compression.type', 'BLOCK'
+        if framework == :mapreduce
+          set 'mapreduce.map.output.compress', true
+          set 'mapreduce.output.fileoutputformat.compress', true
+          set 'mapreduce.map.output.compress.codec', 'org.apache.hadoop.io.compress.GzipCodec'
+          set 'mapreduce.output.fileoutputformat.compress.codec', 'org.apache.hadoop.io.compress.GzipCodec'
+          set 'mapreduce.output.fileoutputformat.compress.type', 'BLOCK'
+        else
+          set 'mapred.compress.map.output', true
+          set 'mapred.output.compress', true
+          set 'mapred.map.output.compression.codec', 'org.apache.hadoop.io.compress.GzipCodec'
+          set 'mapred.output.compression.codec', 'org.apache.hadoop.io.compress.GzipCodec'
+          set 'mapred.output.compression.type', 'BLOCK'
+        end
       end
     end
 
+    def framework
+      @framework ||= @job.configuration.get('mapreduce.framework.name') ? :mapreduce : :mapred
+    end
+
     def local_mode?
-      @job.configuration.get('mapred.job.tracker') == 'local'
+      property = framework == :mapreduce ? 'mapreduce.framework.name' : 'mapred.job.tracker'
+      @job.configuration.get(property) == 'local'
     end
 
     def cache_file(file, options = {})
@@ -56,5 +69,58 @@ module Rubydoop
         Hadoop::FileCache::DistributedCache.add_cache_file(uri, @job.configuration)
       end
     end
+
+    # Configures the job for secondary sort on the specified slice of the mapper
+    # output key.
+    #
+    # Hadoop comes with a partitioner that can partition the map output based
+    # on a slice of the map output key. Humboldt ships with a comparator that
+    # uses the same configuration. Together they can be used to implement
+    # secondary sort.
+    #
+    # Secondary sort is a mapreduce pattern where you emit a key, but partition
+    # and group only on a subset of that key. This has the result that each
+    # reduce invocation will see values grouped by the subset, but ordered by
+    # the whole key. It is used, among other things, to efficiently count
+    # distinct values.
+    #
+    # Say you want to count the number of distinct visitors to a site. Your
+    # input is pairs of site and visitor IDs. The naïve implementation is to
+    # emit the site as key and the visitor ID as value and then, in the reducer,
+    # collect all IDs in a set, and emit the site and the size of the set of IDs.
+    # This is very memory inefficient and impractical. For any interesting
+    # amount of data you will not be able to keep all the visitor IDs in memory.
+    #
+    # What you do instead is concatenate the site and visitor ID and emit that
+    # as key, and the visitor ID as value. It might seem wasteful to emit the
+    # visitor ID twice, but it's necessary since Hadoop will only give you the
+    # key for the first value in each group.
+    #
+    # You then instruct Hadoop to partition and group on just the site part of
+    # the key. Hadoop will still sort the values by their full key, so within
+    # each group the values will be sorted by visitor ID. In the reducer it's
+    # now trivial to loop over the values and just increment a counter each time
+    # the visitor ID changes.
+    #
+    # You configure which part of the key to partition and group by specifying
+    # the start and end _indexes_. The reason why they are indexes, and not a
+    # start index and a length like Ruby's `String#slice`, is that you can also
+    # use negative indexes to count from the end. Negative indexes are useful,
+    # for example, when you don't know how wide the part of the key you want
+    # to use is. In the example above, if you use the domain to identify sites,
+    # these can be of different lengths. If your visitor IDs are 20 characters
+    # you can use 0 and -20 as your indexes.
+    #
+    # @param [Fixnum] start_index The first index of the slice, negative numbers
+    #   are counted from the end
+    # @param [Fixnum] end_index The last index of the slice, negative numbers
+    #   are counted from the end
+    # @see http://hadoop.apache.org/docs/r2.7.1/api/org/apache/hadoop/mapreduce/lib/partition/BinaryPartitioner.html Hadoop's BinaryPartitioner
+    def secondary_sort(start_index, end_index)
+      @job.set_partitioner_class(Hadoop::Mapreduce::Lib::Partition::BinaryPartitioner)
+      Hadoop::Mapreduce::Lib::Partition::BinaryPartitioner.set_offsets(@job.configuration, start_index, end_index)
+      @job.set_grouping_comparator_class(Humboldt::JavaLib::BinaryComparator)
+      Humboldt::JavaLib::BinaryComparator.set_offsets(@job.configuration, start_index, end_index)
+    end
   end
 end
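
Taken together, the new methods are easiest to see in a job definition. The sketch below implements the distinct-visitor count described in the doc comment, assuming 20-character visitor IDs; Rubydoop.configure and Humboldt's block-style reducer follow the gems' documented DSLs, while the job name and the VisitorsMapper/VisitorsReducer classes are hypothetical:

    # Hypothetical job definition; secondary_sort and enable_compression!
    # are the methods added in this release.
    Rubydoop.configure do |input_path, output_path|
      job 'distinct-visitors' do
        input input_path
        output output_path
        mapper VisitorsMapper    # assumed to emit "#{site}#{visitor_id}" => visitor_id
        reducer VisitorsReducer
        secondary_sort 0, -20    # partition and group on all but the last 20 bytes
        enable_compression!      # sets mapreduce.* or mapred.* properties as detected
      end
    end

    # Hypothetical reducer: values arrive grouped by site but sorted by the
    # full key, so equal visitor IDs are adjacent and can be counted without
    # keeping a set in memory.
    class VisitorsReducer < Humboldt::Reducer
      input :text, :text
      output :text, :long

      reduce do |key, visitor_ids|
        count = 0
        previous = nil
        visitor_ids.each do |id|
          count += 1 unless id == previous
          previous = id
        end
        emit(key, count)
      end
    end

Note that enable_compression! now probes mapreduce.framework.name to choose between the mapreduce.* and mapred.* property families, so the same job definition works against both MRv1-style and YARN-era cluster configurations.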
data/lib/humboldt/cli.rb CHANGED
@@ -63,6 +63,7 @@ module Humboldt
       run_command('hadoop', 'jar', project_jar, '-conf', hadoop_config_path, job_config, input_glob, output_path, *options[:extra_hadoop_args])
     end
 
+    # @deprecated EMR support will be removed in 2.0
     desc 'run-emr', 'run a job in Elastic MapReduce'
     method_option :input, :type => :string, :required => true, :desc => 'input glob, will be resolved against the data bucket'
     method_option :output, :type => :string, :desc => 'the output directory, defaults to "<project_name>/<job_config>/output" in the job bucket'
@@ -107,6 +108,7 @@ module Humboldt
       say_status(:started, %{EMR job flow "#{job_flow.job_flow_id}"})
     end
 
+    # @deprecated EMR support will be removed in 2.0
     desc 'emr-job', 'show status of the last EMR job'
     def emr_job
       if File.exists?('.humboldtjob')
@@ -118,6 +120,7 @@ module Humboldt
       end
     end
 
+    # @deprecated EMR support will be removed in 2.0
     desc 'emr-jobs', 'list all EMR jobs'
     def emr_jobs
       emr.job_flows.each do |job_flow|
@@ -1,6 +1,7 @@
 # encoding: utf-8
 
 module Humboldt
+  # @deprecated EMR support will be removed in 2.0
   class EmrFlow
     attr_reader :output_path
 
@@ -2,6 +2,7 @@
 
 module Humboldt
   module Patterns
+    # @deprecated Use Hadoop's LongSumReducer
     class SumReducer < Reducer
       input :text, :long
       output :text, :long
@@ -4,6 +4,7 @@ require 'zlib'
 
 
 module Humboldt
+  # @deprecated Use {Rubydoop::JobDescription::secondary_sort}
   class BinaryPrefixPartitioner
     def initialize(cutoff_index)
       @cutoff_index = cutoff_index
@@ -16,6 +17,7 @@ module Humboldt
     end
   end
 
+  # @deprecated Use {Rubydoop::JobDescription::secondary_sort}
   class DropBinaryPrefixPartitioner < BinaryPrefixPartitioner
     def partition(key, value, num_partitions)
       length = key.length > @cutoff_index ? key.length - @cutoff_index : 0
@@ -24,6 +26,7 @@ module Humboldt
     end
   end
 
+  # @deprecated Use {Rubydoop::JobDescription::secondary_sort}
   class BinaryPrefixComparator
     def initialize(cutoff_index)
       @cutoff_index = cutoff_index
@@ -36,6 +39,7 @@ module Humboldt
     end
   end
 
+  # @deprecated Use {Rubydoop::JobDescription::secondary_sort}
   class DropBinaryPrefixComparator < BinaryPrefixComparator
     def compare_raw(bytes1, start1, length1, bytes2, start2, length2)
       subset_length1 = length1 - @cutoff_index
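
These deprecated classes implement by hand what secondary_sort now delegates to Hadoop's BinaryPartitioner and Humboldt's BinaryComparator: partitioning and comparing on a slice of the raw key. For illustration only, a sketch of the underlying idea (not the gem's implementation; the CRC32 hash is an assumption prompted by this file's require 'zlib'):

    require 'zlib'

    # Illustrative sketch: keys that share the same cutoff_index-byte prefix
    # hash to the same partition and therefore meet in the same reducer.
    def prefix_partition(key, cutoff_index, num_partitions)
      prefix = key.byteslice(0, [cutoff_index, key.length].min)
      Zlib.crc32(prefix) % num_partitions
    end

    prefix_partition('example.com:visitor-00000000000000000001', 11, 16)  # => an integer in 0...16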
@@ -1,5 +1,5 @@
 # encoding: utf-8
 
 module Humboldt
-  VERSION = '1.0.5'.freeze
+  VERSION = '1.1.0'.freeze
 end
data/lib/humboldt.jar CHANGED
Binary file
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: humboldt
 version: !ruby/object:Gem::Version
-  version: 1.0.5
+  version: 1.1.0
 platform: java
 authors:
 - The Burt Platform Team
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-09-03 00:00:00.000000000 Z
+date: 2015-10-01 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement