humboldt 1.0.5-java → 1.1.0-java

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 0dd6faa77ad31ace7326eeee615ceba4a3a34508
-  data.tar.gz: 157bc24062834dbe6d2e0f343d1213c414446ef2
+  metadata.gz: 1b68b1b122d8e69538a1ed888cd8ede41e07a4a7
+  data.tar.gz: b658d3c93c062116ecec9c1b1f3b09fb26b5ff49
 SHA512:
-  metadata.gz: 54d0fcda75d41fbba31402449efa494031202ca76fb7f45ea19494effade4c081c326062e505e9967910320940dba3ecf7b9a15a9d0e917ebf37b31c1234803a
-  data.tar.gz: f0b075d53128a28cee17cf565e44dd2c110a8f2284ea801fe1e93a7f441dd670c924ce881dfa8bada2c0960bf0fe9846d85d07a2dcd5e762fbff009ef275231e
+  metadata.gz: 17780975cfc99cb6f4f4639d23dc3c3336085c5343e37b37841eb7d8eacfc49d7963d5f217d90f9495f4ed8aed1ac36e45419e79d7dddc1cce93aa87b08621c6
+  data.tar.gz: 8b79f3d82f416d5180ff657ca01de80b6370b745a6587aec44efcdfb4918196d0d0b61c84f46ca3fbb4ca0d10e249c5cbc1cf64e241aaa7b3c47771ef76dd43c
data/lib/ext/hadoop.rb CHANGED
@@ -7,4 +7,11 @@ module Hadoop
   module Conf
     include_package 'org.apache.hadoop.conf'
   end
+  module Mapreduce
+    module Lib
+      module Partition
+        include_package 'org.apache.hadoop.mapreduce.lib.partition'
+      end
+    end
+  end
 end
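
Under JRuby, `include_package` makes the classes of the named Java package resolvable as constants in the enclosing Ruby module, so this new nesting is what lets `secondary_sort` (added below) refer to Hadoop's partitioner by a Ruby name. A minimal sketch, assuming JRuby with the Hadoop client jars on the classpath and the gem's lib directory on the load path:

    require 'ext/hadoop'

    # Resolves to org.apache.hadoop.mapreduce.lib.partition.BinaryPartitioner
    # via the module added in this change.
    Hadoop::Mapreduce::Lib::Partition::BinaryPartitioner
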
data/lib/ext/rubydoop.rb CHANGED
@@ -33,16 +33,29 @@ module Rubydoop

     def enable_compression!
       unless local_mode?
-        set 'mapred.compress.map.output', true
-        set 'mapred.output.compress', true
-        set 'mapred.map.output.compression.codec', 'org.apache.hadoop.io.compress.GzipCodec'
-        set 'mapred.output.compression.codec', 'org.apache.hadoop.io.compress.GzipCodec'
-        set 'mapred.output.compression.type', 'BLOCK'
+        if framework == :mapreduce
+          set 'mapreduce.map.output.compress', true
+          set 'mapreduce.output.fileoutputformat.compress', true
+          set 'mapreduce.map.output.compress.codec', 'org.apache.hadoop.io.compress.GzipCodec'
+          set 'mapreduce.output.fileoutputformat.compress.codec', 'org.apache.hadoop.io.compress.GzipCodec'
+          set 'mapreduce.output.fileoutputformat.compress.type', 'BLOCK'
+        else
+          set 'mapred.compress.map.output', true
+          set 'mapred.output.compress', true
+          set 'mapred.map.output.compression.codec', 'org.apache.hadoop.io.compress.GzipCodec'
+          set 'mapred.output.compression.codec', 'org.apache.hadoop.io.compress.GzipCodec'
+          set 'mapred.output.compression.type', 'BLOCK'
+        end
       end
     end

+    def framework
+      @framework ||= @job.configuration.get('mapreduce.framework.name') ? :mapreduce : :mapred
+    end
+
     def local_mode?
-      @job.configuration.get('mapred.job.tracker') == 'local'
+      property = framework == :mapreduce ? 'mapreduce.framework.name' : 'mapred.job.tracker'
+      @job.configuration.get(property) == 'local'
     end

     def cache_file(file, options = {})
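
The new `framework` helper keys off whether `mapreduce.framework.name` is set at all: MRv2/YARN configurations normally set it (to `yarn`, or `local` for the local runner), so those jobs get the `mapreduce.*` property names, while classic MRv1 setups keep the legacy `mapred.*` names for both compression and the local-mode check. A rough sketch of the decision, assuming JRuby and a plain Hadoop Configuration (the property value is illustrative):

    conf = Hadoop::Conf::Configuration.new
    conf.set('mapreduce.framework.name', 'yarn')      # as a YARN cluster config would

    framework = conf.get('mapreduce.framework.name') ? :mapreduce : :mapred
    # framework => :mapreduce, so the mapreduce.* compression keys get set

    conf.get('mapreduce.framework.name') == 'local'   # => false, not local mode
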
@@ -56,5 +69,58 @@ module Rubydoop
         Hadoop::FileCache::DistributedCache.add_cache_file(uri, @job.configuration)
       end
     end
+
+    # Configures the job for secondary sort on the specified slice of the mapper
+    # output key.
+    #
+    # Hadoop comes with a partitioner that can partition the map output based
+    # on a slice of the map output key. Humboldt ships with a comparator that
+    # uses the same configuration. Together they can be used to implement
+    # secondary sort.
+    #
+    # Secondary sort is a mapreduce pattern where you emit a key, but partition
+    # and group only on a subset of that key. This has the result that each
+    # reduce invocation will see values grouped by the subset, but ordered by
+    # the whole key. It is used, among other things, to efficiently count
+    # distinct values.
+    #
+    # Say you want to count the number of distinct visitors to a site. Your
+    # input is pairs of site and visitor IDs. The naïve implementation is to
+    # emit the site as key and the visitor ID as value and then, in the reducer,
+    # collect all IDs in a set, and emit the site and the size of the set of IDs.
+    # This is very memory-inefficient and impractical. For any interesting
+    # amount of data you will not be able to keep all the visitor IDs in memory.
+    #
+    # What you do, instead, is to concatenate the site and visitor ID and emit
+    # that as key, and the visitor ID as value. It might seem wasteful to emit
+    # the visitor ID twice, but it's necessary since Hadoop will only give you
+    # the key for the first value in each group.
+    #
+    # You then instruct Hadoop to partition and group on just the site part of
+    # the key. Hadoop will still sort the values by their full key, so within
+    # each group the values will be sorted by visitor ID. In the reducer it's
+    # now trivial to loop over the values and just increment a counter each time
+    # the visitor ID changes.
+    #
+    # You configure which part of the key to partition and group by specifying
+    # the start and end _indexes_. The reason why they are indexes and not a
+    # start index and a length, like Ruby's `String#slice`, is that you can also
+    # use negative indexes to count from the end. Negative indexes are useful,
+    # for example, when you don't know how wide the part of the key that you want
+    # to use is. In the example above, if you use the domain to identify sites,
+    # these can be of different lengths. If your visitor IDs are 20 characters you
+    # can use 0 and -20 as your indexes.
+    #
+    # @param [Fixnum] start_index The first index of the slice, negative numbers
+    #   are counted from the end
+    # @param [Fixnum] end_index The last index of the slice, negative numbers
+    #   are counted from the end
+    # @see http://hadoop.apache.org/docs/r2.7.1/api/org/apache/hadoop/mapreduce/lib/partition/BinaryPartitioner.html Hadoop's BinaryPartitioner
+    def secondary_sort(start_index, end_index)
+      @job.set_partitioner_class(Hadoop::Mapreduce::Lib::Partition::BinaryPartitioner)
+      Hadoop::Mapreduce::Lib::Partition::BinaryPartitioner.set_offsets(@job.configuration, start_index, end_index)
+      @job.set_grouping_comparator_class(Humboldt::JavaLib::BinaryComparator)
+      Humboldt::JavaLib::BinaryComparator.set_offsets(@job.configuration, start_index, end_index)
+    end
   end
 end
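
The distinct-visitor count the docstring walks through could look roughly like this in a Rubydoop job configuration. This is only a sketch: the job, mapper and reducer names are made up, and it assumes site keys of varying length with fixed 20-character visitor IDs appended, which is why the offsets are 0 and -20:

    # job_config.rb -- illustrative sketch, not part of the gem
    require 'humboldt'

    Rubydoop.configure do |input_glob, output_path|
      job 'distinct visitors per site' do
        input input_glob
        output output_path

        mapper VisitorsMapper    # hypothetical: emits "#{site}#{visitor_id}" => visitor_id
        reducer VisitorsReducer  # hypothetical: bumps a counter whenever the visitor ID changes

        # Partition and group on everything except the trailing 20-character
        # visitor ID; the full key still drives the sort order.
        secondary_sort 0, -20

        enable_compression!
      end
    end
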
data/lib/humboldt/cli.rb CHANGED
@@ -63,6 +63,7 @@ module Humboldt
       run_command('hadoop', 'jar', project_jar, '-conf', hadoop_config_path, job_config, input_glob, output_path, *options[:extra_hadoop_args])
     end

+    # @deprecated EMR support will be removed in 2.0
     desc 'run-emr', 'run a job in Elastic MapReduce'
     method_option :input, :type => :string, :required => true, :desc => 'input glob, will be resolved against the data bucket'
     method_option :output, :type => :string, :desc => 'the output directory, defaults to "<project_name>/<job_config>/output" in the job bucket'
@@ -107,6 +108,7 @@ module Humboldt
       say_status(:started, %{EMR job flow "#{job_flow.job_flow_id}"})
     end

+    # @deprecated EMR support will be removed in 2.0
     desc 'emr-job', 'show status of the last EMR job'
     def emr_job
       if File.exists?('.humboldtjob')
@@ -118,6 +120,7 @@ module Humboldt
       end
     end

+    # @deprecated EMR support will be removed in 2.0
     desc 'emr-jobs', 'list all EMR jobs'
     def emr_jobs
       emr.job_flows.each do |job_flow|
@@ -1,6 +1,7 @@
 # encoding: utf-8

 module Humboldt
+  # @deprecated EMR support will be removed in 2.0
   class EmrFlow
     attr_reader :output_path

@@ -2,6 +2,7 @@

 module Humboldt
   module Patterns
+    # @deprecated Use Hadoop's LongSumReducer
     class SumReducer < Reducer
       input :text, :long
       output :text, :long
@@ -4,6 +4,7 @@ require 'zlib'


 module Humboldt
+  # @deprecated Use {Rubydoop::JobDescription::secondary_sort}
   class BinaryPrefixPartitioner
     def initialize(cutoff_index)
       @cutoff_index = cutoff_index
@@ -16,6 +17,7 @@ module Humboldt
     end
   end

+  # @deprecated Use {Rubydoop::JobDescription::secondary_sort}
   class DropBinaryPrefixPartitioner < BinaryPrefixPartitioner
     def partition(key, value, num_partitions)
       length = key.length > @cutoff_index ? key.length - @cutoff_index : 0
@@ -24,6 +26,7 @@ module Humboldt
     end
   end

+  # @deprecated Use {Rubydoop::JobDescription::secondary_sort}
   class BinaryPrefixComparator
     def initialize(cutoff_index)
       @cutoff_index = cutoff_index
@@ -36,6 +39,7 @@ module Humboldt
     end
   end

+  # @deprecated Use {Rubydoop::JobDescription::secondary_sort}
   class DropBinaryPrefixComparator < BinaryPrefixComparator
     def compare_raw(bytes1, start1, length1, bytes2, start2, length2)
       subset_length1 = length1 - @cutoff_index
@@ -1,5 +1,5 @@
 # encoding: utf-8

 module Humboldt
-  VERSION = '1.0.5'.freeze
+  VERSION = '1.1.0'.freeze
 end
data/lib/humboldt.jar CHANGED
Binary file
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: humboldt
 version: !ruby/object:Gem::Version
-  version: 1.0.5
+  version: 1.1.0
 platform: java
 authors:
 - The Burt Platform Team
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-09-03 00:00:00.000000000 Z
+date: 2015-10-01 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement