humboldt 1.0.0-java
- checksums.yaml +7 -0
- data/bin/humboldt +8 -0
- data/config/emr-bootstrap/remove_old_jruby.sh +11 -0
- data/config/hadoop-local.xml +12 -0
- data/lib/ext/hadoop.rb +10 -0
- data/lib/ext/rubydoop.rb +60 -0
- data/lib/humboldt/cli.rb +263 -0
- data/lib/humboldt/emr_flow.rb +198 -0
- data/lib/humboldt/hadoop_status_filter.rb +97 -0
- data/lib/humboldt/java_lib.rb +5 -0
- data/lib/humboldt/mapper.rb +15 -0
- data/lib/humboldt/patterns/sum_reducer.rb +16 -0
- data/lib/humboldt/prefix_grouping.rb +46 -0
- data/lib/humboldt/processor.rb +96 -0
- data/lib/humboldt/reducer.rb +34 -0
- data/lib/humboldt/rspec.rb +100 -0
- data/lib/humboldt/type_converters.rb +180 -0
- data/lib/humboldt/version.rb +5 -0
- data/lib/humboldt.jar +0 -0
- data/lib/humboldt.rb +16 -0
- metadata +112 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA1:
  metadata.gz: 7ca8c0825572a27fc4673c042f4d5677e250340a
  data.tar.gz: 910d598f4df79e42eda1bbf4d022db9a160c48c3
SHA512:
  metadata.gz: 6cf04eb473b93684dfde74f56ed6113d9c8d15fbfcc7686b33d878570a804ee5c29c11e0fdc29692b689f9b5039e8f0a21a93781d669c2ae4d4dc8384f7d436f
  data.tar.gz: 84d6fab2395c97ebda5eac18684371ee475ee336823a4cb936f11196b8905911436af418bfb3386649c6c3c2097095c38a1aab2ada827f56e33f8020c50e6f38
data/bin/humboldt
ADDED
data/config/emr-bootstrap/remove_old_jruby.sh
ADDED
@@ -0,0 +1,11 @@
#!/bin/bash

if [ -e /home/hadoop/lib/jruby-complete-no-joda-1.6.5.jar ]
then
  rm /home/hadoop/lib/jruby-complete-no-joda-1.6.5.jar
fi

if [ -e /home/hadoop/lib/jruby-complete-1.6.8.jar ]
then
  rm /home/hadoop/lib/jruby-complete-1.6.8.jar
fi
data/lib/ext/hadoop.rb
ADDED
data/lib/ext/rubydoop.rb
ADDED
@@ -0,0 +1,60 @@
# encoding: utf-8

module Rubydoop
  class JobDefinition
    alias mapperrr mapper
    def mapper(cls)
      map_output_key cls.output_key.const_get(:HADOOP) if cls.respond_to?(:output_key)
      map_output_value cls.output_value.const_get(:HADOOP) if cls.respond_to?(:output_value)
      mapperrr cls
    end

    alias reducerrr reducer
    def reducer(cls)
      output_key cls.output_key.const_get(:HADOOP) if cls.respond_to?(:output_key)
      output_value cls.output_value.const_get(:HADOOP) if cls.respond_to?(:output_value)
      reducerrr cls
    end

    alias inputtt input
    def input(paths, options={})
      options = options.dup
      format = options[:format]
      STDERR.puts "Warning! Using `format: :combined_text` will not work with remote input paths (e.g. S3) and Hadoop 1.x. Cf. https://issues.apache.org/jira/browse/MAPREDUCE-1806" if format == :combined_text
      unless format.nil? or format.is_a?(Class)
        class_name = format.to_s.gsub(/^.|_./) {|x| x[-1,1].upcase } + "InputFormat"
        begin
          options[:format] = Humboldt::JavaLib.const_get(class_name)
        rescue NameError
        end
      end
      inputtt(paths, options)
    end

    def enable_compression!
      unless local_mode?
        set 'mapred.compress.map.output', true
        set 'mapred.output.compress', true
        set 'mapred.map.output.compression.codec', 'org.apache.hadoop.io.compress.GzipCodec'
        set 'mapred.output.compression.codec', 'org.apache.hadoop.io.compress.GzipCodec'
        set 'mapred.output.compression.type', 'BLOCK'
      end
    end

    def local_mode?
      @job.configuration.get('mapred.job.tracker') == 'local'
    end

    def cache_file(file, options = {})
      symlink = options.fetch(:as, File.basename(file))
      if local_mode? && !Hadoop::Mapreduce::Job.instance_methods.include?(:add_cache_file)
        unless File.symlink?(symlink) && File.readlink(symlink) == file
          FileUtils.ln_s file, symlink
        end
      else
        uri = java.net.URI.new("#{file}\##{symlink}")
        Hadoop::FileCache::DistributedCache.add_cache_file(uri, @job.configuration)
      end
    end
  end
end
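These extensions hook into Rubydoop's job DSL so that mappers and reducers declare their own key/value types and input formats can be given as symbols. A minimal sketch of a job configuration file that relies on them follows; the job name, paths and WordCount classes are hypothetical, only the DSL calls (input, output, mapper, reducer, enable_compression!, cache_file) come from the listing above.

# Hypothetical job configuration (e.g. lib/word_count.rb) using the extended DSL.
require 'humboldt'

Rubydoop.configure do |input_path, output_path|
  job 'word_count' do
    # :combined_text is resolved to Humboldt::JavaLib::CombinedTextInputFormat when it exists
    input input_path, format: :combined_text
    output output_path

    # output key/value types are picked up from the processor classes
    mapper WordCount::Mapper
    reducer WordCount::Reducer

    enable_compression!
    cache_file 'data/stopwords.txt', as: 'stopwords.txt'
  end
end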
data/lib/humboldt/cli.rb
ADDED
@@ -0,0 +1,263 @@
# encoding: utf-8

require 'thor'
require 'aws'
require 'open3'
require 'rubydoop/package' # this prints an annoying warning in JRuby 1.7.0.RC1
require 'humboldt/emr_flow'
require 'humboldt/hadoop_status_filter'


module Humboldt
  class Cli < Thor
    include Thor::Actions

    DEFAULTS = {
      data_path: 'data/completes',
      silent: true,
      skip_package: false,
      extra_hadoop_args: [],
      cleanup_before: false,
      instance_count: 4,
      instance_type: 'c1.xlarge',
      spot_instances: nil,
      bid_price: 0.2,
      poll: false,
      skip_prepare: false,
      aws_region: 'eu-west-1',
      hadoop_version: '1.0.3'
    }

    desc 'package', 'Package job JAR file'
    def package
      say_status(:package, relative_path(job_package.jar_path))
      job_package.create!
    end

    desc 'run-local', 'run a job in local mode with the hadoop command'
    method_option :input, :type => :string, :required => true, :desc => 'input glob, will be resolved against the data path'
    method_option :output, :type => :string, :desc => 'the output directory, defaults to "data/<job_config>/output"'
    method_option :job_config, :type => 'string', :desc => 'the name of the Ruby file containing the job configuration, defaults to the project name (e.g. "lib/<job_config>.rb")'
    method_option :hadoop_config, :type => 'string', :desc => 'the path to a Hadoop configuration XML file, defaults to Humboldt-provided config that runs Hadoop in local-mode'
    method_option :cleanup_before, :type => :boolean, :desc => "automatically remove the output dir before launching (default: #{DEFAULTS[:cleanup_before]})"
    method_option :data_path, :type => :string, :desc => "input paths will be resolved against this path (default: #{DEFAULTS[:data_path]})"
    method_option :silent, :type => :boolean, :desc => "silence the hadoop command's logging (default: #{DEFAULTS[:silent]})"
    method_option :skip_package, :type => :boolean, :desc => "don't package the JAR, use only if you haven't changed anything since the last run (default: #{DEFAULTS[:skip_package]})"
    method_option :extra_hadoop_args, :type => :array, :desc => "extra arguments to pass on to hadoop (default: #{DEFAULTS[:extra_hadoop_args]})"
    def run_local
      check_job!
      invoke(:package, [], {}) unless options.skip_package?
      output_path = options[:output] || "data/#{job_config}/output"
      output_path_parent = File.dirname(output_path)
      if options.cleanup_before?
        remove_file(output_path)
      else
        check_local_output!(output_path)
      end
      unless File.exists?(output_path_parent)
        empty_directory(output_path_parent)
      end
      input_glob = File.join(options[:data_path], options[:input])
      hadoop_config_path = options[:hadoop_config] || default_hadoop_config_path
      run_command('hadoop', 'jar', project_jar, '-conf', hadoop_config_path, job_config, input_glob, output_path, *options[:extra_hadoop_args])
    end

    desc 'run-emr', 'run a job in Elastic MapReduce'
    method_option :input, :type => :string, :required => true, :desc => 'input glob, will be resolved against the data bucket'
    method_option :output, :type => :string, :desc => 'the output directory, defaults to "<project_name>/<job_config>/output" in the job bucket'
    method_option :job_config, :type => 'string', :desc => 'the name of the Ruby file containing the job configuration, defaults to the project name (e.g. "lib/<job_config>.rb")'
    method_option :cleanup_before, :type => :boolean, :desc => "automatically remove the output dir before launching (default: #{DEFAULTS[:cleanup_before]})"
    method_option :data_bucket, :type => :string, :desc => "S3 bucket containing input data (default: #{DEFAULTS[:data_bucket]})"
    method_option :job_bucket, :type => :string, :desc => "S3 bucket to upload JAR, output logs and results into (default: #{DEFAULTS[:job_bucket]})"
    method_option :instance_count, :type => :numeric, :desc => "the number of worker instances to launch (default: #{DEFAULTS[:instance_count]})"
    method_option :instance_type, :type => :string, :desc => "the worker instance type, see http://ec2pricing.iconara.info/ for available types (default: #{DEFAULTS[:instance_type]})"
    method_option :spot_instances, :type => :array, :lazy_default => [], :desc => 'use spot instances; either an explicit list of instance groups or no value to run all groups as spot instances'
    method_option :bid_price, :type => :string, :desc => "how much to bid for spot instances, see http://ec2pricing.iconara.info/ for current spot prices (default: #{DEFAULTS[:bid_price]})"
    method_option :poll, :type => :boolean, :desc => "poll the job's status every 10s and display (default: #{DEFAULTS[:poll]})"
    method_option :skip_package, :type => :boolean, :desc => "don't package the JAR, use only if you haven't changed anything since the last run (default: #{DEFAULTS[:skip_package]})"
    method_option :skip_prepare, :type => :boolean, :desc => "don't upload the JAR and bootstrap files, use only if you haven't changed anything since the last run (default: #{DEFAULTS[:skip_prepare]})"
    method_option :extra_hadoop_args, :type => :array, :desc => "extra arguments to pass on to hadoop (default: #{DEFAULTS[:extra_hadoop_args]})"
    method_option :ec2_key_name, :type => :string, :desc => 'The name of an EC2 key pair to enable SSH access to master node'
    method_option :aws_region, :type => :string, :desc => "The AWS region where the EMR flow is to run (default: #{DEFAULTS[:aws_region]})"
    method_option :hadoop_version, :type => :string, :desc => "The EMR Hadoop version to use (default: #{DEFAULTS[:hadoop_version]})"
    def run_emr
      check_job!
      invoke(:package, [], {}) unless options.skip_package?
      flow = EmrFlow.new(job_config, options[:input], job_package, emr, data_bucket, job_bucket, options[:output])
      if options.cleanup_before?
        say_status(:remove, flow.output_uri)
        flow.cleanup!
      end
      unless options.skip_prepare?
        say_status(:upload, flow.jar_uri)
        flow.prepare!
      end
      say_status(:warning, "No EC2 key name configured. You will not be able to access the master node via SSH.", :yellow) unless options[:ec2_key_name]
      job_flow = flow.run!(
        bid_price: options[:bid_price],
        instance_count: options[:instance_count],
        instance_type: options[:instance_type],
        spot_instances: options[:spot_instances],
        extra_hadoop_args: options[:extra_hadoop_args],
        ec2_key_name: options[:ec2_key_name],
        hadoop_version: options[:hadoop_version]
      )
      File.open('.humboldtjob', 'w') { |io| io.puts(job_flow.job_flow_id) }
      say_status(:started, %{EMR job flow "#{job_flow.job_flow_id}"})
    end

    desc 'emr-job', 'show status of the last EMR job'
    def emr_job
      if File.exists?('.humboldtjob')
        job_flow_id = File.read('.humboldtjob').strip
        job_flow = emr.job_flows[job_flow_id]
        print_job_flow_extended_status(job_flow)
      else
        say_status(:warning, 'Could not determine last job flow ID')
      end
    end

    desc 'emr-jobs', 'list all EMR jobs'
    def emr_jobs
      emr.job_flows.each do |job_flow|
        print_job_flow_status(job_flow)
      end
    end

    desc 'configure', 'Configure humboldt for the current project'
    def configure
      say("Please ensure you are located at the root directory of the project you are configuring.", :yellow)
      configuration = options_from_config_file
      say('EMR configuration', :green)
      configuration[:ec2_key_name] = ask("EC2 key pair name to enable SSH access to EMR master node: [#{config_file_options_with_defaults[:ec2_key_name]}]")
      configuration[:aws_region] = ask("AWS region: [#{config_file_options_with_defaults[:aws_region]}]")
      configuration[:hadoop_version] = ask("Hadoop version: [#{config_file_options_with_defaults[:hadoop_version]}]")
      configuration[:data_bucket] = ask("Input data S3 bucket: [#{config_file_options_with_defaults[:data_bucket]}]")
      configuration[:job_bucket] = ask("Job S3 bucket (where JAR is uploaded, output logs and job output go to): [#{config_file_options_with_defaults[:job_bucket]}]")
      configuration.each do |key, value|
        value = configuration[key] = config_file_options_with_defaults[key] if value.empty?
        configuration.delete(key) if value.empty? || value == DEFAULTS[key]
      end
      File.open('.humboldt.yml', 'w') { |f| YAML.dump(configuration, f) }
      say('Updated .humboldt.yml', :green)
    end

    no_commands do
      def options
        @extended_options ||= Thor::CoreExt::HashWithIndifferentAccess.new(config_file_options_with_defaults.merge(super))
      end
    end

    private

    ISO_DATE_TIME = '%Y-%m-%d %H:%M:%S'.freeze

    def project_jar
      @project_jar ||= Dir['build/*.jar'].reject { |path| path.start_with?('build/jruby-complete') }.first
    end

    def job_package
      @job_package ||= Rubydoop::Package.new(lib_jars: Dir[File.expand_path('../../**/*.jar', __FILE__)])
    end

    def job_config
      options[:job_config] || job_package.project_name
    end

    def default_hadoop_config_path
      File.expand_path('../../../config/hadoop-local.xml', __FILE__)
    end

    def s3
      @s3 ||= AWS::S3.new
    end

    def emr
      @emr ||= AWS::EMR.new(region: options[:aws_region])
    end

    def job_bucket
      @job_bucket ||= s3.buckets[options[:job_bucket]]
    end

    def data_bucket
      @data_bucket ||= s3.buckets[options[:data_bucket]]
    end

    def check_job!
      raise Thor::Error, "No such job: #{job_config}" unless File.exists?("lib/#{job_config}.rb")
    end

    def relative_path(path)
      path.sub(Dir.pwd + '/', '')
    end

    def check_local_output!(path)
      if File.exists?(path)
        raise Thor::Error, "#{options[:output]} already exists!"
      end
    end

    def run_command(*args)
      say_status(:running, 'Hadoop started')
      Open3.popen3(*args) do |stdin, stdout, stderr, wait_thr|
        stdin.close
        stdout_printer = Thread.new(stdout) do |stdout|
          while line = stdout.gets
            say(line.chomp)
          end
        end
        stderr_printer = Thread.new(stderr) do |stderr|
          filter = HadoopStatusFilter.new(stderr, self, options.silent?)
          filter.run
        end
        stdout_printer.join
        stderr_printer.join
        if wait_thr.value.exitstatus == 0
          say_status(:done, 'Job completed')
        else
          say_status(:failed, 'Job failed', :red)
        end
      end
    end

    def print_job_flow_extended_status(job_flow)
      id = job_flow.job_flow_id
      state = job_flow.state
      created_at = job_flow.created_at.strftime(ISO_DATE_TIME)
      change_reason = job_flow.last_state_change_reason
      say_status(:started, created_at)
      say_status(:state, state)
      say_status(:change, change_reason)
    rescue => e
      say_status(:error, e.message, :red)
      sleep 1
      retry
    end

    def print_job_flow_status(job_flow)
      id = job_flow.job_flow_id
      state = job_flow.state
      created_at = job_flow.created_at.strftime(ISO_DATE_TIME)
      change_reason = job_flow.last_state_change_reason
      say_status(:status, sprintf('%-15s %-10s %19s %s', id, state, created_at, change_reason))
    rescue => e
      say_status(:error, e.message, :red)
      sleep 1
      retry
    end

    def config_file_options_with_defaults
      @config_file_options_with_defaults ||= DEFAULTS.merge(options_from_config_file)
    end

    def options_from_config_file
      @options_from_config_file ||= begin
        ::YAML::load_file(".humboldt.yml")
      rescue Errno::ENOENT
        {}
      end
    end
  end
end
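These commands are normally invoked through the humboldt executable, but since Humboldt::Cli is a plain Thor class it can also be driven from Ruby via Thor's standard start method. A hedged sketch; the input glob is made up for illustration:

# Hypothetical programmatic invocation, roughly equivalent to running
# `humboldt run_local --input 'events/2014-06-*' --cleanup-before` from a project root.
require 'humboldt/cli'

Humboldt::Cli.start(%w[run_local --input events/2014-06-* --cleanup-before])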
data/lib/humboldt/emr_flow.rb
ADDED
@@ -0,0 +1,198 @@
# encoding: utf-8

module Humboldt
  class EmrFlow
    attr_reader :output_path

    def initialize(*args)
      @job_name, @input_glob, @package, @emr, @data_bucket, @job_bucket, @output_path = args
      @output_path ||= "#{@package.project_name}/#{@job_name}/output"
    end

    def prepare!
      upload_bootstrap_task_files!
      upload_jar!
    end

    def cleanup!
      delete_output_dir!
    end

    def run!(launch_options={})
      check_jar!
      check_output_dir!
      create_flow!(launch_options)
    end

    def jar_path
      "#{@package.project_name}/#{File.basename(@package.jar_path)}"
    end

    def jar_uri
      s3_uri(jar_path)
    end

    def output_uri
      s3_uri(output_path)
    end

    def log_path
      "#{@package.project_name}/#{@job_name}/logs"
    end

    private

    BOOTSTRAP_TASK_FILES = {
      :remove_old_jruby => 'config/emr-bootstrap/remove_old_jruby.sh'
    }.freeze

    def s3_uri(path, options={})
      protocol = options[:protocol] || 's3'
      bucket = options[:bucket] || @job_bucket
      "#{protocol}://#{bucket.name}/#{path}"
    end

    def upload_bootstrap_task_files!
      BOOTSTRAP_TASK_FILES.values.each do |local_path|
        remote_obj = @job_bucket.objects["#{@package.project_name}/#{local_path}"]
        remote_obj.write(Pathname.new(File.expand_path(local_path, "#{__FILE__}/../../..")))
      end
    end

    def upload_jar!
      # TODO: upload only if not exists and MD5 != ETag
      jar_obj = @job_bucket.objects[jar_path]
      jar_obj.write(Pathname.new(@package.jar_path))
    end

    def check_jar!
      unless @job_bucket.objects.with_prefix(jar_path).any?
        raise "Job JAR missing (#{s3_uri(jar_path)})"
      end
    end

    def check_output_dir!
      if @job_bucket.objects.with_prefix(output_path).any?
        raise "Output directory already exists (#{s3_uri(output_path)})"
      end
    end

    def delete_output_dir!
      @job_bucket.objects.with_prefix(output_path).delete_all
    end

    def job_flow_configuration(launch_options)
      {
        :log_uri => s3_uri(log_path),
        :instances => instance_configuration(launch_options),
        :steps => [step_configuration(launch_options)],
        :bootstrap_actions => bootstrap_actions,
        :visible_to_all_users => true
      }
    end

    def instance_configuration(launch_options)
      {
        :ec2_key_name => launch_options[:ec2_key_name],
        :hadoop_version => launch_options[:hadoop_version],
        :instance_groups => InstanceGroupConfiguration.create(launch_options)
      }
    end

    def bootstrap_actions
      remove_old_jruby_action = {
        :name => 'remove_old_jruby',
        :script_bootstrap_action => {
          :path => s3_uri("#{@package.project_name}/#{BOOTSTRAP_TASK_FILES[:remove_old_jruby]}")
        }
      }

      # http://hadoop.apache.org/docs/r1.0.3/mapred-default.html
      configure_hadoop_action = {
        :name => 'configure_hadoop',
        :script_bootstrap_action => {
          :path => 's3://eu-west-1.elasticmapreduce/bootstrap-actions/configure-hadoop',
          :args => [
            '-m', 'mapred.job.reuse.jvm.num.tasks=-1',
            '-m', 'mapred.map.tasks.speculative.execution=false',
            '-m', 'mapred.reduce.tasks.speculative.execution=false'
          ]
        }
      }

      [remove_old_jruby_action, configure_hadoop_action]
    end

    def step_configuration(launch_options)
      {
        :name => @package.project_name,
        :hadoop_jar_step => {
          :jar => s3_uri(jar_path),
          :args => [
            @job_name,
            s3_uri(@input_glob, protocol: 's3n', bucket: @data_bucket),
            s3_uri(output_path, protocol: 's3n'),
            *launch_options[:extra_hadoop_args]
          ]
        }
      }
    end

    def create_flow!(launch_options)
      job_flow = @emr.job_flows.create(@package.project_name, job_flow_configuration(launch_options))
    end

    module InstanceGroupConfiguration
      extend self

      # TODO: add 'task' group when support is added for 'tasks'
      INSTANCE_GROUPS = %w[master core].freeze
      MASTER_INSTANCE_TYPE = 'm1.small'.freeze
      DEFAULT_CORE_INSTANCE_TYPE = 'c1.xlarge'.freeze
      DEFAULT_BID_PRICE = '0.2'.freeze
      DEFAULT_CORE_INSTANCE_COUNT = 4

      INSTANCE_TYPE_MAPPINGS = {
        'master' => MASTER_INSTANCE_TYPE,
        'core' => DEFAULT_CORE_INSTANCE_TYPE
      }.freeze

      INSTANCE_COUNT_MAPPINGS = {
        'master' => 1,
        'core' => DEFAULT_CORE_INSTANCE_COUNT
      }.freeze

      def base_configuration(group)
        {:name => "#{group.capitalize} Group", :instance_role => group.upcase}
      end

      def configure_type_and_count(group, configuration, options = {})
        if group == 'core'
          configuration[:instance_type] = options[:instance_type]
          configuration[:instance_count] = options[:instance_count]
        end

        configuration[:instance_type] ||= INSTANCE_TYPE_MAPPINGS[group]
        configuration[:instance_count] ||= INSTANCE_COUNT_MAPPINGS[group]
      end

      def configure_market(group, configuration, spot_instances, bid_price)
        if spot_instances && (spot_instances.empty? || spot_instances.include?(group))
          configuration[:market] = 'SPOT'
          configuration[:bid_price] = bid_price || DEFAULT_BID_PRICE
        else
          configuration[:market] = 'ON_DEMAND'
        end
      end

      def create(options)
        INSTANCE_GROUPS.map do |group|
          configuration = base_configuration(group)
          configure_type_and_count(group, configuration, options)
          configure_market(group, configuration, options[:spot_instances], options[:bid_price])
          configuration
        end
      end
    end
  end
end
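For reference, this mirrors how the run-emr command wires an EmrFlow together; a sketch with hypothetical bucket names and a hypothetical job, assuming the aws-sdk v1 API the gem depends on:

# Hypothetical standalone use of EmrFlow, mirroring Humboldt::Cli#run_emr.
require 'aws'
require 'rubydoop/package'
require 'humboldt/emr_flow'

s3  = AWS::S3.new
emr = AWS::EMR.new(region: 'eu-west-1')
package = Rubydoop::Package.new   # run from the project root

flow = Humboldt::EmrFlow.new('word_count', 'events/2014-06-*', package, emr,
                             s3.buckets['example-data-bucket'],
                             s3.buckets['example-job-bucket'])
flow.prepare!   # uploads the job JAR and the bootstrap script
job_flow = flow.run!(instance_count: 4, instance_type: 'c1.xlarge', hadoop_version: '1.0.3')
puts job_flow.job_flow_id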
data/lib/humboldt/hadoop_status_filter.rb
ADDED
@@ -0,0 +1,97 @@
# encoding: utf-8

module Humboldt
  class HadoopStatusFilter
    def initialize(hadoop_stderr, shell, silent)
      @hadoop_stderr = hadoop_stderr
      @shell = shell
      @silent = silent
      @counters = {}
    end

    def run
      counter_group = nil
      while line = @hadoop_stderr.gets
        if @counters_printing && (hadoop_log?(line) || line =~ /^\t+/)
          case line.chomp
          when /(?:JobClient: |\t+)([^\t]+)=(\d+)$/
            if counter_group
              @counters[counter_group] ||= {}
              @counters[counter_group][$1.strip] = $2.to_i
            end
          when /(?:JobClient: |\t+)([^\t]+)$/
            counter_group = $1.strip
          end
        elsif @error_printing && !hadoop_log?(line) && !ignore?(line)
          report_error(line)
        elsif ignore?(line)
          # do nothing
        else
          @counters_printing = false
          @error_printing = false
          case line
          when /map (\d+)% reduce (\d+)%/
            report_progress($1, $2)
          when /Counters: \d+/
            @counters_printing = true
          else
            unless hadoop_log?(line)
              @error_printing = true
              if line =~ /warning(!|:)/i
                @error_type = :warning
              else
                @error_type = :error
              end
              report_error(line)
            end
          end
        end
        @shell.say(line.chomp, :red) unless @silent
      end
      print_counters_table
    end

    private

    def hadoop_log?(line)
      line =~ /(?:INFO|WARN) (?:mapred|input|output|util|jvm|mapreduce)\./
    end

    def ignore?(line)
      case line
      when /^\s*$/,
           /Warning: \$HADOOP_HOME is deprecated/,
           /Unable to load realm info from SCDynamicStore/,
           /Unable to load native-hadoop library/,
           /Snappy native library not loaded/,
           /Configuration.deprecation:/,
           /WARN conf.Configuration.*attempt to override final parameter.*ignoring/i
        true
      else
        false
      end
    end

    def report_progress(map, reduce)
      @shell.say_status(:progress, "map #{map}%, reduce #{reduce}%")
    end

    def report_error(line)
      @shell.say_status(@error_type, line.chomp, @error_type == :error ? :red : :yellow)
    end

    def print_counters_table
      table = @counters.flat_map do |group, counters|
        [
          [group, *counters.first],
          *counters.drop(1).map { |counter, value| ['', counter, value] },
          ['', '', '']
        ]
      end
      table.pop
      @shell.say
      @shell.print_table(table)
      @shell.say
    end
  end
end
data/lib/humboldt/mapper.rb
ADDED
@@ -0,0 +1,15 @@
# encoding: utf-8

module Humboldt
  class Mapper < Processor
    class << self
      def map(&block)
        define_method(:map) do |key, value, context|
          @in_key.hadoop = key
          @in_value.hadoop = value
          instance_exec(@in_key.ruby, @in_value.ruby, &block)
        end
      end
    end
  end
end
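A mapper is defined by subclassing Humboldt::Mapper, declaring input and output types (resolved through the type converters listed later) and giving the map body as a block; emit comes from Processor. A minimal, hypothetical example:

# Hypothetical word-count mapper built on the class above.
class WordCount
  class Mapper < Humboldt::Mapper
    input :long, :text      # e.g. TextInputFormat byte offsets and lines
    output :text, :long

    map do |_offset, line|
      line.split(/\W+/).each do |word|
        emit(word.downcase, 1) unless word.empty?
      end
    end
  end
end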
data/lib/humboldt/prefix_grouping.rb
ADDED
@@ -0,0 +1,46 @@
# encoding: utf-8

require 'zlib'


module Humboldt
  class BinaryPrefixPartitioner
    def initialize(cutoff_index)
      @cutoff_index = cutoff_index
    end

    def partition(key, value, num_partitions)
      length = @cutoff_index > key.length ? key.length : @cutoff_index
      prefix = String.from_java_bytes(key.bytes)[0, length]
      Zlib.crc32(prefix) % num_partitions
    end
  end

  class DropBinaryPrefixPartitioner < BinaryPrefixPartitioner
    def partition(key, value, num_partitions)
      length = key.length > @cutoff_index ? key.length - @cutoff_index : 0
      prefix = String.from_java_bytes(key.bytes)[0, length]
      Zlib.crc32(prefix) % num_partitions
    end
  end

  class BinaryPrefixComparator
    def initialize(cutoff_index)
      @cutoff_index = cutoff_index
    end

    def compare_raw(bytes1, start1, length1, bytes2, start2, length2)
      subset_length1 = @cutoff_index > length1 ? length1 : @cutoff_index
      subset_length2 = @cutoff_index > length2 ? length2 : @cutoff_index
      ::Hadoop::Io::WritableComparator.compareBytes(bytes1, start1, subset_length1, bytes2, start2, subset_length2)
    end
  end

  class DropBinaryPrefixComparator < BinaryPrefixComparator
    def compare_raw(bytes1, start1, length1, bytes2, start2, length2)
      subset_length1 = length1 - @cutoff_index
      subset_length2 = length2 - @cutoff_index
      ::Hadoop::Io::WritableComparator.compareBytes(bytes1, start1, subset_length1, bytes2, start2, subset_length2)
    end
  end
end
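These partitioners and comparators group keys by a fixed-length binary prefix (or by everything after it), which is the usual building block for secondary sort. A small, hypothetical illustration of what the partitioning does, assuming a Hadoop BytesWritable key as the classes expect:

# Keys sharing the same 8-byte prefix always land in the same partition,
# regardless of what follows the prefix. Key contents are made up.
partitioner = Humboldt::BinaryPrefixPartitioner.new(8)

key = ::Hadoop::Io::BytesWritable.new
key.set('user-001:2014-06-03'.to_java_bytes, 0, 19)

partition = partitioner.partition(key, nil, 16)   # => an integer in 0...16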
data/lib/humboldt/processor.rb
ADDED
@@ -0,0 +1,96 @@
# encoding: utf-8

module Humboldt
  class Processor
    class << self
      def self.type_accessor(*names)
        names.each do |name|
          module_eval <<-EOA
            def #{name}
              @#{name} || superclass.#{name}
            end
            def #{name}=(type)
              @#{name} = TypeConverter[type]
              define_method(:#{name}_accessor) do
                TypeConverter[type].new
              end
            end
          EOA
        end
      end

      type_accessor :input_key, :input_value, :output_key, :output_value

      def input(*types)
        self.input_key = types.first
        self.input_value = types.last
      end

      def output(*types)
        self.output_key = types.first
        self.output_value = types.last
      end

      def setup(&block)
        define_method(:instance_setup, &block)
        private(:instance_setup)
      end

      def cleanup(&block)
        define_method(:instance_cleanup, &block)
        private(:instance_cleanup)
      end
    end

    attr_reader :current_context

    def setup(context)
      @current_context = context
      @in_key = input_key_accessor
      @in_value = input_value_accessor
      @out_key = output_key_accessor
      @out_value = output_value_accessor
      unless Hadoop::Mapreduce::Job.instance_methods.include?(:add_cache_file)
        create_symlinks!
      end
      instance_setup
    end

    def cleanup(context)
      instance_cleanup
    end

    protected

    def emit(key, value)
      @out_key.ruby = key
      @out_value.ruby = value
      @current_context.write(@out_key.hadoop, @out_value.hadoop)
    end

    private

    def instance_setup
    end

    def instance_cleanup
    end

    def create_symlinks!
      distributed_cache = ::Hadoop::FileCache::DistributedCache
      files = distributed_cache.get_cache_files(@current_context.configuration)
      local_files = distributed_cache.get_local_cache_files(@current_context.configuration)
      if files && local_files
        work_dir = ENV['HADOOP_WORK_DIR']
        files.each_with_index do |file, i|
          target = local_files[i].to_s
          link_path = File.join(work_dir, file.fragment)
          FileUtils.mkdir_p(File.dirname(link_path))
          unless File.exists?(link_path)
            FileUtils.ln_s(target, link_path)
          end
        end
      end
    end
  end
end
data/lib/humboldt/reducer.rb
ADDED
@@ -0,0 +1,34 @@
# encoding: utf-8

module Humboldt
  class Reducer < Processor
    class << self
      def reduce(&block)
        define_method(:reduce) do |key, values, context|
          @in_key.hadoop = key
          values_enumerator = TypeConversionEnumerator.new(@in_value, values.iterator)
          instance_exec(@in_key.ruby, values_enumerator, &block)
        end
      end
    end

    class TypeConversionEnumerator < Enumerator
      def initialize(*args)
        @value_converter, @hadoop_iterator = args
      end

      def each
        while @hadoop_iterator.has_next
          @value_converter.hadoop = @hadoop_iterator.next
          yield @value_converter.ruby
        end
      end

      def next
        raise StopIteration unless @hadoop_iterator.has_next
        @value_converter.hadoop = @hadoop_iterator.next
        @value_converter.ruby
      end
    end
  end
end
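Reducers follow the same pattern; the values arrive through the TypeConversionEnumerator above, so the block sees plain Ruby objects. A hypothetical counterpart to the mapper sketched earlier:

# Hypothetical word-count reducer; counts is an enumerator of Ruby integers.
class WordCount
  class Reducer < Humboldt::Reducer
    input :text, :long
    output :text, :long

    reduce do |word, counts|
      emit(word, counts.reduce(0) { |sum, n| sum + n })
    end
  end
end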
data/lib/humboldt/rspec.rb
ADDED
@@ -0,0 +1,100 @@
# encoding: utf-8

require 'humboldt'


module RunnerHelpers
  def run_nokey_mapper(mapper, *values, &context_callback)
    key = mapper.input_key_accessor.ruby
    args = values.map { |value| [key, value] }
    run_mapper(mapper, *args, &context_callback)
  end

  def run_mapper(mapper, *entries, &context_callback)
    in_value = mapper.input_value_accessor
    run(mapper, :map, context_callback, *entries) do |value|
      in_value.ruby = value
      in_value.hadoop
    end
  end

  def run_reducer(reducer, *entries, &context_callback)
    run(reducer, :reduce, context_callback, *entries) do |value|
      fake_iterator(*value.map do |v|
        in_value = reducer.input_value_accessor
        in_value.ruby = v
        in_value.hadoop
      end)
    end
  end

  def run(runner, method, context_callback, *entries)
    in_key = runner.input_key_accessor
    context = FakeContext.new(runner.output_key_accessor, runner.output_value_accessor)
    context_callback.call(context) if context_callback
    runner.setup(context)
    entries.each do |entry|
      in_key.ruby = entry.first
      runner.send(method, in_key.hadoop, yield(entry.last), context)
    end
    runner.cleanup(context)
    context.results
  end

  def fake_iterator(*values)
    FakeIterable.new(values)
  end

  class FakeIterable
    def initialize(values)
      @values = values
    end
    def iterator
      FakeIterator.new(@values.dup)
    end
  end

  class FakeIterator
    def initialize(values)
      @values = values
    end
    def has_next
      !@values.empty?
    end
    def next
      @values.shift
    end
  end

  class FakeContext
    attr_reader :results, :counters

    def initialize(key_accessor, value_accessor)
      @key_accessor, @value_accessor = key_accessor, value_accessor
      @results = []
      @counters = Hash.new { |h,k| h[k] = Hash.new { |h2,k2| h2[k2] = 0 } }
    end

    def write(key, value)
      @key_accessor.hadoop = key
      @value_accessor.hadoop = value
      @results << [@key_accessor.ruby, @value_accessor.ruby]
    end

    def configuration
      @configuration ||= ::Hadoop::Conf::Configuration.new.tap do |config|
        config.set 'mapred.job.tracker', 'local'
      end
    end

    def get_counter(group, name)
      FakeCounter.new do |amount|
        @counters[group][name] += amount
      end
    end
  end
end

RSpec.configure do |conf|
  conf.include(RunnerHelpers)
end
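With humboldt/rspec required, mappers and reducers can be exercised without a running Hadoop. A hedged example against the hypothetical WordCount classes used above:

# spec/word_count_spec.rb (hypothetical)
require 'humboldt/rspec'
require 'word_count'

describe WordCount::Reducer do
  it 'sums the counts for each word' do
    results = run_reducer(described_class.new, ['hadoop', [1, 2, 3]])
    expect(results).to eq([['hadoop', 6]])
  end
end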
data/lib/humboldt/type_converters.rb
ADDED
@@ -0,0 +1,180 @@
# encoding: utf-8

module Humboldt
  module TypeConverter
    class Binary
      HADOOP = ::Hadoop::Io::BytesWritable
      RUBY = ::String

      attr_reader :hadoop

      def hadoop=(value)
        unless value.is_a?(HADOOP)
          raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected #{HADOOP}"
        end
        @hadoop = value
      end

      def initialize
        @hadoop = HADOOP.new
      end

      def ruby
        String.from_java_bytes(@hadoop.bytes).byteslice(0, @hadoop.length)
      end

      def ruby=(value)
        unless value.is_a?(RUBY)
          raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected #{RUBY}"
        end

        @hadoop.set(value.to_java_bytes, 0, value.bytesize)
      end
    end

    begin
      require 'msgpack'

      class Encoded < Binary
        def ruby=(value)
          unless value.is_a?(Hash) || value.is_a?(Array)
            raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected Hash or Array"
          end
          packed = MessagePack.pack(value)
          @hadoop.set(packed.to_java_bytes, 0, packed.bytesize)
        end

        def ruby
          packed = String.from_java_bytes(@hadoop.bytes).byteslice(0, @hadoop.length)
          MessagePack.unpack(packed, encoding: Encoding::UTF_8)
        end
      end
    rescue LoadError
    end

    class Text
      HADOOP = ::Hadoop::Io::Text
      RUBY = ::String

      attr_reader :hadoop

      def hadoop=(value)
        unless value.is_a?(HADOOP)
          raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected #{HADOOP}"
        end
        @hadoop = value
      end

      def initialize
        @hadoop = HADOOP.new
      end

      def ruby
        String.from_java_bytes(@hadoop.bytes).byteslice(0, @hadoop.length).force_encoding(Encoding::UTF_8)
      end

      def ruby=(value)
        unless value.is_a?(RUBY)
          raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected #{RUBY}"
        end

        if value.encoding == Encoding::UTF_8
          @hadoop.set(value.to_java_bytes, 0, value.bytesize)
        else
          @hadoop.set(value)
        end
      end
    end

    begin
      require 'json'

      class Json < Text
        def ruby=(value)
          unless value.is_a?(Hash) || value.is_a?(Array)
            raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected Hash or Array"
          end
          @hadoop.set(JSON.generate(value))
        end

        def ruby
          JSON.parse(hadoop.to_s)
        end
      end
    end

    class Long
      HADOOP = ::Hadoop::Io::LongWritable
      RUBY = ::Integer

      attr_reader :hadoop

      def hadoop=(value)
        unless value.is_a?(HADOOP)
          raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected #{HADOOP}"
        end
        @hadoop = value
      end

      def initialize
        @hadoop = HADOOP.new
      end

      def ruby
        @hadoop.get
      end

      def ruby=(value)
        unless value.is_a?(Integer)
          raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected #{RUBY}"
        end

        @hadoop.set value
      end
    end

    class None
      HADOOP = ::Hadoop::Io::NullWritable
      RUBY = ::NilClass

      def hadoop
        HADOOP.get
      end

      def hadoop=(value)
        unless value.is_a?(HADOOP)
          raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected #{HADOOP}"
        end
      end

      def ruby
        nil
      end

      def ruby=(value)
        unless value.nil?
          raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected #{RUBY}"
        end
      end
    end

    TYPE_CONVERTER_CLASS_CACHE = Hash.new { |h,k| h[k] = const_get(k.to_s.capitalize) }

    def self.[](name)
      TYPE_CONVERTER_CLASS_CACHE[name]
    end

    FROM_HADOOP_MAPPINGS = {
      ::Hadoop::Io::Text => Text,
      ::Hadoop::Io::BytesWritable => Binary,
      ::Hadoop::Io::LongWritable => Long,
      ::Hadoop::Io::NullWritable => None
    }.freeze

    def self.from_hadoop(hadoop_class)
      accessor = FROM_HADOOP_MAPPINGS[hadoop_class]
      raise ArgumentError, "Unsupported Hadoop type: #{hadoop_class}" unless accessor
      accessor
    end
  end
end
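Each converter wraps one Hadoop writable and exposes it as a Ruby value; TypeConverter[:text], [:long], [:binary], [:json] and so on resolve through the class cache at the bottom of the file. A short round-trip sketch; the string value is made up:

# Round-tripping a Ruby string through the Text converter.
converter = Humboldt::TypeConverter[:text].new
converter.ruby = 'skål'      # stored in a Hadoop::Io::Text writable
converter.hadoop             # => the underlying Text instance
converter.ruby               # => "skål", forced to UTF-8

Humboldt::TypeConverter.from_hadoop(::Hadoop::Io::LongWritable)  # => Humboldt::TypeConverter::Long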
data/lib/humboldt.jar
ADDED
Binary file
data/lib/humboldt.rb
ADDED
@@ -0,0 +1,16 @@
# encoding: utf-8

require 'fileutils'
require 'rubydoop'
require 'hadoop'

require 'humboldt/java_lib'

require 'ext/hadoop'
require 'ext/rubydoop'

require 'humboldt/type_converters'
require 'humboldt/processor'
require 'humboldt/mapper'
require 'humboldt/reducer'
require 'humboldt/prefix_grouping'
metadata
ADDED
@@ -0,0 +1,112 @@
--- !ruby/object:Gem::Specification
name: humboldt
version: !ruby/object:Gem::Version
  version: 1.0.0
platform: java
authors:
- The Burt Platform Team
autorequire:
bindir: bin
cert_chain: []
date: 2014-06-03 00:00:00.000000000 Z
dependencies:
- !ruby/object:Gem::Dependency
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
  name: thor
  prerelease: false
  type: :runtime
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - ~>
      - !ruby/object:Gem::Version
        version: 1.1.2
  name: rubydoop
  prerelease: false
  type: :runtime
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ~>
      - !ruby/object:Gem::Version
        version: 1.1.2
- !ruby/object:Gem::Dependency
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: 1.16.0
    - - <
      - !ruby/object:Gem::Version
        version: 1.33.0
  name: aws-sdk
  prerelease: false
  type: :runtime
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: 1.16.0
    - - <
      - !ruby/object:Gem::Version
        version: 1.33.0
description: Humboldt provides a mapreduce API abstraction built on top of Rubydoop, and tools to run Hadoop jobs effortlessly both locally and on Amazon EMR
email:
- theo@burtcorp.com
executables:
- humboldt
extensions: []
extra_rdoc_files: []
files:
- bin/humboldt
- config/emr-bootstrap/remove_old_jruby.sh
- config/hadoop-local.xml
- lib/ext/hadoop.rb
- lib/ext/rubydoop.rb
- lib/humboldt.jar
- lib/humboldt.rb
- lib/humboldt/cli.rb
- lib/humboldt/emr_flow.rb
- lib/humboldt/hadoop_status_filter.rb
- lib/humboldt/java_lib.rb
- lib/humboldt/mapper.rb
- lib/humboldt/patterns/sum_reducer.rb
- lib/humboldt/prefix_grouping.rb
- lib/humboldt/processor.rb
- lib/humboldt/reducer.rb
- lib/humboldt/rspec.rb
- lib/humboldt/type_converters.rb
- lib/humboldt/version.rb
homepage: http://github.com/burtcorp/humboldt
licenses:
- BSD-3-Clause
metadata: {}
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - '>='
    - !ruby/object:Gem::Version
      version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - '>='
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubyforge_project:
rubygems_version: 2.2.2
signing_key:
specification_version: 4
summary: Tools and libraries for simplifying running Rubydoop jobs locally and on AWS Elastic MapReduce
test_files: []