humboldt 1.0.5-java → 1.1.0-java
- checksums.yaml +4 -4
- data/lib/ext/hadoop.rb +7 -0
- data/lib/ext/rubydoop.rb +72 -6
- data/lib/humboldt/cli.rb +3 -0
- data/lib/humboldt/emr_flow.rb +1 -0
- data/lib/humboldt/patterns/sum_reducer.rb +1 -0
- data/lib/humboldt/prefix_grouping.rb +4 -0
- data/lib/humboldt/version.rb +1 -1
- data/lib/humboldt.jar +0 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 1b68b1b122d8e69538a1ed888cd8ede41e07a4a7
+  data.tar.gz: b658d3c93c062116ecec9c1b1f3b09fb26b5ff49
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 17780975cfc99cb6f4f4639d23dc3c3336085c5343e37b37841eb7d8eacfc49d7963d5f217d90f9495f4ed8aed1ac36e45419e79d7dddc1cce93aa87b08621c6
+  data.tar.gz: 8b79f3d82f416d5180ff657ca01de80b6370b745a6587aec44efcdfb4918196d0d0b61c84f46ca3fbb4ca0d10e249c5cbc1cf64e241aaa7b3c47771ef76dd43c
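
These checksums are digests of the two members inside the gem package, not of the .gem file itself. A minimal verification sketch, assuming a locally downloaded humboldt-1.1.0-java.gem (the filename is illustrative):

# A .gem file is a tar archive containing metadata.gz and data.tar.gz;
# recompute their digests and compare against checksums.yaml above.
require 'digest'
require 'rubygems/package'

File.open('humboldt-1.1.0-java.gem', 'rb') do |io|
  Gem::Package::TarReader.new(io).each do |entry|
    next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
    bytes = entry.read
    puts "#{entry.full_name} SHA1:   #{Digest::SHA1.hexdigest(bytes)}"
    puts "#{entry.full_name} SHA512: #{Digest::SHA512.hexdigest(bytes)}"
  end
end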
data/lib/ext/hadoop.rb
CHANGED
data/lib/ext/rubydoop.rb
CHANGED
@@ -33,16 +33,29 @@ module Rubydoop
 
     def enable_compression!
       unless local_mode?
-        set 'mapred.compress.map.output', true
-        set 'mapred.output.compress', true
-        set 'mapred.map.output.compression.codec', 'org.apache.hadoop.io.compress.GzipCodec'
-        set 'mapred.output.compression.codec', 'org.apache.hadoop.io.compress.GzipCodec'
-        set 'mapred.output.compression.type', 'BLOCK'
+        if framework == :mapreduce
+          set 'mapreduce.map.output.compress', true
+          set 'mapreduce.output.fileoutputformat.compress', true
+          set 'mapreduce.map.output.compress.codec', 'org.apache.hadoop.io.compress.GzipCodec'
+          set 'mapreduce.output.fileoutputformat.compress.codec', 'org.apache.hadoop.io.compress.GzipCodec'
+          set 'mapreduce.output.fileoutputformat.compress.type', 'BLOCK'
+        else
+          set 'mapred.compress.map.output', true
+          set 'mapred.output.compress', true
+          set 'mapred.map.output.compression.codec', 'org.apache.hadoop.io.compress.GzipCodec'
+          set 'mapred.output.compression.codec', 'org.apache.hadoop.io.compress.GzipCodec'
+          set 'mapred.output.compression.type', 'BLOCK'
+        end
       end
     end
 
+    def framework
+      @framework ||= @job.configuration.get('mapreduce.framework.name') ? :mapreduce : :mapred
+    end
+
     def local_mode?
-      @job.configuration.get('mapred.job.tracker') == 'local'
+      property = framework == :mapreduce ? 'mapreduce.framework.name' : 'mapred.job.tracker'
+      @job.configuration.get(property) == 'local'
     end
 
     def cache_file(file, options = {})
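
For context, enable_compression! is called from a Rubydoop job definition; a minimal sketch using Rubydoop's configure DSL (the job name and mapper/reducer classes are hypothetical):

# Hypothetical job definition. enable_compression! now uses the
# mapreduce.* property names when mapreduce.framework.name is set,
# and falls back to the legacy mapred.* names otherwise; in local
# mode it is a no-op.
Rubydoop.configure do |input_path, output_path|
  job 'word count' do
    input input_path
    output output_path
    mapper WordCount::Mapper     # hypothetical
    reducer WordCount::Reducer   # hypothetical
    enable_compression!          # gzip, block-compressed map and job output
  end
end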
@@ -56,5 +69,58 @@ module Rubydoop
         Hadoop::FileCache::DistributedCache.add_cache_file(uri, @job.configuration)
       end
     end
+
+    # Configures the job for secondary sort on the specified slice of the mapper
+    # output key.
+    #
+    # Hadoop comes with a partitioner that can partition the map output based
+    # on a slice of the map output key. Humboldt ships with a comparator that
+    # uses the same configuration. Together they can be used to implement
+    # secondary sort.
+    #
+    # Secondary sort is a mapreduce pattern where you emit a key, but partition
+    # and group only on a subset of that key. This has the result that each
+    # reduce invocation will see values grouped by the subset, but ordered by
+    # the whole key. It is used, among other things, to efficiently count
+    # distinct values.
+    #
+    # Say you want to count the number of distinct visitors to a site. Your
+    # input is pairs of site and visitor IDs. The naïve implementation is to
+    # emit the site as key and the visitor ID as value and then, in the reducer,
+    # collect all IDs in a set and emit the site and the size of the set of IDs.
+    # This is very memory inefficient and impractical. For any interesting
+    # amount of data you will not be able to keep all the visitor IDs in memory.
+    #
+    # What you do instead is concatenate the site and visitor ID and emit
+    # that as key, and the visitor ID as value. It might seem wasteful to emit
+    # the visitor ID twice, but it's necessary since Hadoop will only give you
+    # the key for the first value in each group.
+    #
+    # You then instruct Hadoop to partition and group on just the site part of
+    # the key. Hadoop will still sort the values by their full key, so within
+    # each group the values will be sorted by visitor ID. In the reducer it's
+    # now trivial to loop over the values and increment a counter each time
+    # the visitor ID changes.
+    #
+    # You configure which part of the key to partition and group on by
+    # specifying the start and end _indexes_. The reason they are indexes, and
+    # not a start index and a length like Ruby's `String#slice`, is that you
+    # can also use negative indexes to count from the end. Negative indexes
+    # are useful, for example, when you don't know how wide the part of the
+    # key you want to use is. In the example above, if you use the domain to
+    # identify sites, these can be of different lengths. If your visitor IDs
+    # are 20 characters you can use 0 and -20 as your indexes.
+    #
+    # @param [Fixnum] start_index The first index of the slice, negative numbers
+    #   are counted from the end
+    # @param [Fixnum] end_index The last index of the slice, negative numbers
+    #   are counted from the end
+    # @see http://hadoop.apache.org/docs/r2.7.1/api/org/apache/hadoop/mapreduce/lib/partition/BinaryPartitioner.html Hadoop's BinaryPartitioner
+    def secondary_sort(start_index, end_index)
+      @job.set_partitioner_class(Hadoop::Mapreduce::Lib::Partition::BinaryPartitioner)
+      Hadoop::Mapreduce::Lib::Partition::BinaryPartitioner.set_offsets(@job.configuration, start_index, end_index)
+      @job.set_grouping_comparator_class(Humboldt::JavaLib::BinaryComparator)
+      Humboldt::JavaLib::BinaryComparator.set_offsets(@job.configuration, start_index, end_index)
+    end
   end
 end
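
The distinct-visitors example from the comment above, sketched end to end. The class names and the fixed 20-character visitor IDs are assumptions, as are the details of Humboldt's mapper/reducer DSL (input/output type declarations plus map/reduce blocks that call emit); secondary_sort and the configure block come from the diff above:

class VisitorMapper < Humboldt::Mapper
  input :text, :text
  output :text, :text

  map do |site, visitor_id|
    # Concatenate site and visitor ID into the key; the ID is repeated
    # as the value because Hadoop only hands the reducer the key of the
    # first value in each group.
    emit("#{site}#{visitor_id}", visitor_id)
  end
end

class VisitorCounter < Humboldt::Reducer
  input :text, :text
  output :text, :long

  reduce do |key, visitor_ids|
    count = 0
    previous = nil
    visitor_ids.each do |id|
      # Values arrive sorted by the full key, so equal IDs are adjacent.
      count += 1 unless id == previous
      previous = id.dup # dup, since Hadoop reuses the value object
    end
    emit(key.to_s[0...-20], count) # strip the 20-character ID suffix
  end
end

Rubydoop.configure do |input_path, output_path|
  job 'distinct visitors per site' do
    input input_path
    output output_path
    mapper VisitorMapper
    reducer VisitorCounter
    # Partition and group on the site part of the key only; with
    # 20-character visitor IDs the slice indexes are 0 and -20, as in
    # the comment above.
    secondary_sort 0, -20
  end
end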
data/lib/humboldt/cli.rb
CHANGED
@@ -63,6 +63,7 @@ module Humboldt
       run_command('hadoop', 'jar', project_jar, '-conf', hadoop_config_path, job_config, input_glob, output_path, *options[:extra_hadoop_args])
     end
 
+    # @deprecated EMR support will be removed in 2.0
     desc 'run-emr', 'run a job in Elastic MapReduce'
     method_option :input, :type => :string, :required => true, :desc => 'input glob, will be resolved against the data bucket'
     method_option :output, :type => :string, :desc => 'the output directory, defaults to "<project_name>/<job_config>/output" in the job bucket'
@@ -107,6 +108,7 @@ module Humboldt
       say_status(:started, %{EMR job flow "#{job_flow.job_flow_id}"})
     end
 
+    # @deprecated EMR support will be removed in 2.0
     desc 'emr-job', 'show status of the last EMR job'
     def emr_job
       if File.exists?('.humboldtjob')
@@ -118,6 +120,7 @@ module Humboldt
       end
     end
 
+    # @deprecated EMR support will be removed in 2.0
     desc 'emr-jobs', 'list all EMR jobs'
     def emr_jobs
       emr.job_flows.each do |job_flow|
data/lib/humboldt/emr_flow.rb
CHANGED
data/lib/humboldt/prefix_grouping.rb
CHANGED
@@ -4,6 +4,7 @@ require 'zlib'
 
 
 module Humboldt
+  # @deprecated Use {Rubydoop::JobDescription::secondary_sort}
   class BinaryPrefixPartitioner
     def initialize(cutoff_index)
       @cutoff_index = cutoff_index
@@ -16,6 +17,7 @@ module Humboldt
     end
   end
 
+  # @deprecated Use {Rubydoop::JobDescription::secondary_sort}
   class DropBinaryPrefixPartitioner < BinaryPrefixPartitioner
     def partition(key, value, num_partitions)
       length = key.length > @cutoff_index ? key.length - @cutoff_index : 0
@@ -24,6 +26,7 @@ module Humboldt
     end
   end
 
+  # @deprecated Use {Rubydoop::JobDescription::secondary_sort}
   class BinaryPrefixComparator
     def initialize(cutoff_index)
       @cutoff_index = cutoff_index
@@ -36,6 +39,7 @@ module Humboldt
     end
   end
 
+  # @deprecated Use {Rubydoop::JobDescription::secondary_sort}
   class DropBinaryPrefixComparator < BinaryPrefixComparator
     def compare_raw(bytes1, start1, length1, bytes2, start2, length2)
       subset_length1 = length1 - @cutoff_index
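
In practice the deprecation means that instead of wiring one of these Ruby partitioner/comparator pairs into a job by hand, a single call configures both the partitioner and the grouping comparator, backed by Hadoop's BinaryPartitioner and Humboldt's Java-side BinaryComparator. A hypothetical before/after sketch:

# Before (deprecated): construct a BinaryPrefix*/DropBinaryPrefix* pair
# with a cutoff index and register both on the job yourself.
# After: one call inside the job definition does both.
job 'grouped on a key slice' do
  # ...
  secondary_sort 0, -20 # group on the key minus its 20-character suffix, per the docs above
end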
data/lib/humboldt/version.rb
CHANGED
data/lib/humboldt.jar
CHANGED
Binary file
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: humboldt
 version: !ruby/object:Gem::Version
-  version: 1.0.5
+  version: 1.1.0
 platform: java
 authors:
 - The Burt Platform Team
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-
+date: 2015-10-01 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement