humboldt 1.0.5-java → 1.1.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/ext/hadoop.rb +7 -0
- data/lib/ext/rubydoop.rb +72 -6
- data/lib/humboldt/cli.rb +3 -0
- data/lib/humboldt/emr_flow.rb +1 -0
- data/lib/humboldt/patterns/sum_reducer.rb +1 -0
- data/lib/humboldt/prefix_grouping.rb +4 -0
- data/lib/humboldt/version.rb +1 -1
- data/lib/humboldt.jar +0 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1b68b1b122d8e69538a1ed888cd8ede41e07a4a7
|
4
|
+
data.tar.gz: b658d3c93c062116ecec9c1b1f3b09fb26b5ff49
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 17780975cfc99cb6f4f4639d23dc3c3336085c5343e37b37841eb7d8eacfc49d7963d5f217d90f9495f4ed8aed1ac36e45419e79d7dddc1cce93aa87b08621c6
|
7
|
+
data.tar.gz: 8b79f3d82f416d5180ff657ca01de80b6370b745a6587aec44efcdfb4918196d0d0b61c84f46ca3fbb4ca0d10e249c5cbc1cf64e241aaa7b3c47771ef76dd43c
|
data/lib/ext/hadoop.rb
CHANGED
data/lib/ext/rubydoop.rb
CHANGED
@@ -33,16 +33,29 @@ module Rubydoop
|
|
33
33
|
|
34
34
|
def enable_compression!
|
35
35
|
unless local_mode?
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
36
|
+
if framework == :mapreduce
|
37
|
+
set 'mapreduce.map.output.compress', true
|
38
|
+
set 'mapreduce.output.fileoutputformat.compress', true
|
39
|
+
set 'mapreduce.map.output.compress.codec', 'org.apache.hadoop.io.compress.GzipCodec'
|
40
|
+
set 'mapreduce.output.fileoutputformat.compress.codec', 'org.apache.hadoop.io.compress.GzipCodec'
|
41
|
+
set 'mapreduce.output.fileoutputformat.compress.type', 'BLOCK'
|
42
|
+
else
|
43
|
+
set 'mapred.compress.map.output', true
|
44
|
+
set 'mapred.output.compress', true
|
45
|
+
set 'mapred.map.output.compression.codec', 'org.apache.hadoop.io.compress.GzipCodec'
|
46
|
+
set 'mapred.output.compression.codec', 'org.apache.hadoop.io.compress.GzipCodec'
|
47
|
+
set 'mapred.output.compression.type', 'BLOCK'
|
48
|
+
end
|
41
49
|
end
|
42
50
|
end
|
43
51
|
|
52
|
+
def framework
|
53
|
+
@framework ||= @job.configuration.get('mapreduce.framework.name') ? :mapreduce : :mapred
|
54
|
+
end
|
55
|
+
|
44
56
|
def local_mode?
|
45
|
-
|
57
|
+
property = framework == :mapreduce ? 'mapreduce.framework.name' : 'mapred.job.tracker'
|
58
|
+
@job.configuration.get(property) == 'local'
|
46
59
|
end
|
47
60
|
|
48
61
|
def cache_file(file, options = {})
|
@@ -56,5 +69,58 @@ module Rubydoop
|
|
56
69
|
Hadoop::FileCache::DistributedCache.add_cache_file(uri, @job.configuration)
|
57
70
|
end
|
58
71
|
end
|
72
|
+
|
73
|
+
# Configures the job for secondary sort on the specified slice of the mapper
|
74
|
+
# output key.
|
75
|
+
#
|
76
|
+
# Hadoop comes with a partitioner that can partition the map output based
|
77
|
+
# on a slice of the map output key. Humboldt ships with a comparator that
|
78
|
+
# uses the same configuration. Together they can be used to implement
|
79
|
+
# secondary sort.
|
80
|
+
#
|
81
|
+
# Secondary sort is a mapreduce pattern where you emit a key, but partition
|
82
|
+
# and group only on a subset of that key. This has the result that each
|
83
|
+
# reduce invocation will see values grouped by the subset, but ordered by
|
84
|
+
# the whole key. It is used, among other things, to efficiently count
|
85
|
+
# distinct values.
|
86
|
+
#
|
87
|
+
# Say you want to count the number of distinct visitors to a site. Your
|
88
|
+
# input is pairs of site and visitor IDs. The naïve implementation is to
|
89
|
+
# emit the site as key and the visitor ID as value and then, in the reducer,
|
90
|
+
# collect all IDs in a set, and emit the site and the size of the set of IDs.
|
91
|
+
# This is very memory inefficient, and impractical. For any interesting
|
92
|
+
# amount of data you will not be able to keep all the visitor IDs in memory.
|
93
|
+
#
|
94
|
+
# What you do, instead, is to concatenate the site and visitor ID and emit
|
95
|
+
# that as key, and the visitor ID as value. It might seem wasteful to emit
|
96
|
+
# the visitor ID twice, but it's necessary since Hadoop will only give you
|
97
|
+
# the key for the first value in each group.
|
98
|
+
#
|
99
|
+
# You then instruct Hadoop to partition and group on just the site part of
|
100
|
+
# the key. Hadoop will still sort the values by their full key, so within
|
101
|
+
# each group the values will be sorted by visitor ID. In the reducer it's
|
102
|
+
# now trivial to loop over the values and just increment a counter each time
|
103
|
+
# the visitor ID changes.
|
104
|
+
#
|
105
|
+
# You configure which part of the key to partition and group by specifying
|
106
|
+
# the start and end _indexes_. The reason why they are indexes and not a
|
107
|
+
# start index and a length, like Ruby's `String#slice`, is that you also can
|
108
|
+
# use negative indexes to count from the end. Negative indexes are useful
|
109
|
+
# for example when you don't know how wide the part of the key that you want
|
110
|
+
# to use is. In the example above, if you use the domain to identify sites, these
|
111
|
+
# can be of different length. If your visitor IDs are 20 characters you can
|
112
|
+
# use 0 and -20 as your indexes.
|
113
|
+
#
|
114
|
+
# @param [Fixnum] start_index The first index of the slice, negative numbers
|
115
|
+
# are counted from the end
|
116
|
+
# @param [Fixnum] end_index The last index of the slice, negative numbers
|
117
|
+
# are counted from the end
|
118
|
+
# @see http://hadoop.apache.org/docs/r2.7.1/api/org/apache/hadoop/mapreduce/lib/partition/BinaryPartitioner.html Hadoop's BinaryPartitioner
|
119
|
+
def secondary_sort(start_index, end_index)
|
120
|
+
@job.set_partitioner_class(Hadoop::Mapreduce::Lib::Partition::BinaryPartitioner)
|
121
|
+
Hadoop::Mapreduce::Lib::Partition::BinaryPartitioner.set_offsets(@job.configuration, start_index, end_index)
|
122
|
+
@job.set_grouping_comparator_class(Humboldt::JavaLib::BinaryComparator)
|
123
|
+
Humboldt::JavaLib::BinaryComparator.set_offsets(@job.configuration, start_index, end_index)
|
124
|
+
end
|
59
125
|
end
|
60
126
|
end
|
data/lib/humboldt/cli.rb
CHANGED
@@ -63,6 +63,7 @@ module Humboldt
|
|
63
63
|
run_command('hadoop', 'jar', project_jar, '-conf', hadoop_config_path, job_config, input_glob, output_path, *options[:extra_hadoop_args])
|
64
64
|
end
|
65
65
|
|
66
|
+
# @deprecated EMR support will be removed in 2.0
|
66
67
|
desc 'run-emr', 'run a job in Elastic MapReduce'
|
67
68
|
method_option :input, :type => :string, :required => true, :desc => 'input glob, will be resolved against the data bucket'
|
68
69
|
method_option :output, :type => :string, :desc => 'the output directory, defaults to "<project_name>/<job_config>/output" in the job bucket'
|
@@ -107,6 +108,7 @@ module Humboldt
|
|
107
108
|
say_status(:started, %{EMR job flow "#{job_flow.job_flow_id}"})
|
108
109
|
end
|
109
110
|
|
111
|
+
# @deprecated EMR support will be removed in 2.0
|
110
112
|
desc 'emr-job', 'show status of the last EMR job'
|
111
113
|
def emr_job
|
112
114
|
if File.exists?('.humboldtjob')
|
@@ -118,6 +120,7 @@ module Humboldt
|
|
118
120
|
end
|
119
121
|
end
|
120
122
|
|
123
|
+
# @deprecated EMR support will be removed in 2.0
|
121
124
|
desc 'emr-jobs', 'list all EMR jobs'
|
122
125
|
def emr_jobs
|
123
126
|
emr.job_flows.each do |job_flow|
|
data/lib/humboldt/emr_flow.rb
CHANGED
@@ -4,6 +4,7 @@ require 'zlib'
|
|
4
4
|
|
5
5
|
|
6
6
|
module Humboldt
|
7
|
+
# @deprecated Use {Rubydoop::JobDescription#secondary_sort}
|
7
8
|
class BinaryPrefixPartitioner
|
8
9
|
def initialize(cutoff_index)
|
9
10
|
@cutoff_index = cutoff_index
|
@@ -16,6 +17,7 @@ module Humboldt
|
|
16
17
|
end
|
17
18
|
end
|
18
19
|
|
20
|
+
# @deprecated Use {Rubydoop::JobDescription#secondary_sort}
|
19
21
|
class DropBinaryPrefixPartitioner < BinaryPrefixPartitioner
|
20
22
|
def partition(key, value, num_partitions)
|
21
23
|
length = key.length > @cutoff_index ? key.length - @cutoff_index : 0
|
@@ -24,6 +26,7 @@ module Humboldt
|
|
24
26
|
end
|
25
27
|
end
|
26
28
|
|
29
|
+
# @deprecated Use {Rubydoop::JobDescription#secondary_sort}
|
27
30
|
class BinaryPrefixComparator
|
28
31
|
def initialize(cutoff_index)
|
29
32
|
@cutoff_index = cutoff_index
|
@@ -36,6 +39,7 @@ module Humboldt
|
|
36
39
|
end
|
37
40
|
end
|
38
41
|
|
42
|
+
# @deprecated Use {Rubydoop::JobDescription#secondary_sort}
|
39
43
|
class DropBinaryPrefixComparator < BinaryPrefixComparator
|
40
44
|
def compare_raw(bytes1, start1, length1, bytes2, start2, length2)
|
41
45
|
subset_length1 = length1 - @cutoff_index
|
data/lib/humboldt/version.rb
CHANGED
data/lib/humboldt.jar
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: humboldt
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.5
|
4
|
+
version: 1.1.0
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- The Burt Platform Team
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-10-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|