humboldt 1.0.0-java

checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 7ca8c0825572a27fc4673c042f4d5677e250340a
+   data.tar.gz: 910d598f4df79e42eda1bbf4d022db9a160c48c3
+ SHA512:
+   metadata.gz: 6cf04eb473b93684dfde74f56ed6113d9c8d15fbfcc7686b33d878570a804ee5c29c11e0fdc29692b689f9b5039e8f0a21a93781d669c2ae4d4dc8384f7d436f
+   data.tar.gz: 84d6fab2395c97ebda5eac18684371ee475ee336823a4cb936f11196b8905911436af418bfb3386649c6c3c2097095c38a1aab2ada827f56e33f8020c50e6f38
data/bin/humboldt ADDED
@@ -0,0 +1,8 @@
+ #!/usr/bin/env ruby
+
+ $: << File.expand_path('../../lib', __FILE__)
+
+ require 'humboldt/cli'
+
+
+ Humboldt::Cli.start
data/config/emr-bootstrap/remove_old_jruby.sh ADDED
@@ -0,0 +1,11 @@
+ #!/bin/bash
+
+ if [ -e /home/hadoop/lib/jruby-complete-no-joda-1.6.5.jar ]
+ then
+   rm /home/hadoop/lib/jruby-complete-no-joda-1.6.5.jar
+ fi
+
+ if [ -e /home/hadoop/lib/jruby-complete-1.6.8.jar ]
+ then
+   rm /home/hadoop/lib/jruby-complete-1.6.8.jar
+ fi
data/config/hadoop-local.xml ADDED
@@ -0,0 +1,12 @@
+ <?xml version="1.0"?>
+
+ <configuration>
+   <property>
+     <name>fs.default.name</name>
+     <value>file:///</value>
+   </property>
+   <property>
+     <name>mapred.job.tracker</name>
+     <value>local</value>
+   </property>
+ </configuration>
data/lib/ext/hadoop.rb ADDED
@@ -0,0 +1,10 @@
+ # encoding: utf-8
+
+ module Hadoop
+   module FileCache
+     include_package 'org.apache.hadoop.filecache'
+   end
+   module Conf
+     include_package 'org.apache.hadoop.conf'
+   end
+ end
data/lib/ext/rubydoop.rb ADDED
@@ -0,0 +1,60 @@
+ # encoding: utf-8
+
+ module Rubydoop
+   class JobDefinition
+     alias mapperrr mapper
+     def mapper(cls)
+       map_output_key cls.output_key.const_get(:HADOOP) if cls.respond_to?(:output_key)
+       map_output_value cls.output_value.const_get(:HADOOP) if cls.respond_to?(:output_value)
+       mapperrr cls
+     end
+
+     alias reducerrr reducer
+     def reducer(cls)
+       output_key cls.output_key.const_get(:HADOOP) if cls.respond_to?(:output_key)
+       output_value cls.output_value.const_get(:HADOOP) if cls.respond_to?(:output_value)
+       reducerrr cls
+     end
+
+     alias inputtt input
+     def input(paths, options={})
+       options = options.dup
+       format = options[:format]
+       STDERR.puts "Warning! Using `format: :combined_text` will not work with remote input paths (e.g. S3) and Hadoop 1.x. Cf. https://issues.apache.org/jira/browse/MAPREDUCE-1806" if format == :combined_text
+       unless format.nil? or format.is_a?(Class)
+         class_name = format.to_s.gsub(/^.|_./) {|x| x[-1,1].upcase } + "InputFormat"
+         begin
+           options[:format] = Humboldt::JavaLib.const_get(class_name)
+         rescue NameError
+         end
+       end
+       inputtt(paths, options)
+     end
+
+     def enable_compression!
+       unless local_mode?
+         set 'mapred.compress.map.output', true
+         set 'mapred.output.compress', true
+         set 'mapred.map.output.compression.codec', 'org.apache.hadoop.io.compress.GzipCodec'
+         set 'mapred.output.compression.codec', 'org.apache.hadoop.io.compress.GzipCodec'
+         set 'mapred.output.compression.type', 'BLOCK'
+       end
+     end
+
+     def local_mode?
+       @job.configuration.get('mapred.job.tracker') == 'local'
+     end
+
+     def cache_file(file, options = {})
+       symlink = options.fetch(:as, File.basename(file))
+       if local_mode? && !Hadoop::Mapreduce::Job.instance_methods.include?(:add_cache_file)
+         unless File.symlink?(symlink) && File.readlink(symlink) == file
+           FileUtils.ln_s file, symlink
+         end
+       else
+         uri = java.net.URI.new("#{file}\##{symlink}")
+         Hadoop::FileCache::DistributedCache.add_cache_file(uri, @job.configuration)
+       end
+     end
+   end
+ end
data/lib/humboldt/cli.rb ADDED
@@ -0,0 +1,263 @@
+ # encoding: utf-8
+
+ require 'thor'
+ require 'aws'
+ require 'open3'
+ require 'rubydoop/package' # this prints an annoying warning in JRuby 1.7.0.RC1
+ require 'humboldt/emr_flow'
+ require 'humboldt/hadoop_status_filter'
+
+
+
+ module Humboldt
+   class Cli < Thor
+     include Thor::Actions
+
+     DEFAULTS = {
+       data_path: 'data/completes',
+       silent: true,
+       skip_package: false,
+       extra_hadoop_args: [],
+       cleanup_before: false,
+       instance_count: 4,
+       instance_type: 'c1.xlarge',
+       spot_instances: nil,
+       bid_price: 0.2,
+       poll: false,
+       skip_prepare: false,
+       aws_region: 'eu-west-1',
+       hadoop_version: '1.0.3'
+     }
+
+     desc 'package', 'Package job JAR file'
+     def package
+       say_status(:package, relative_path(job_package.jar_path))
+       job_package.create!
+     end
+
+     desc 'run-local', 'run a job in local mode with the hadoop command'
+     method_option :input, :type => :string, :required => true, :desc => 'input glob, will be resolved against the data path'
+     method_option :output, :type => :string, :desc => 'the output directory, defaults to "data/<job_config>/output"'
+     method_option :job_config, :type => 'string', :desc => 'the name of the Ruby file containing the job configuration, defaults to the project name (e.g. "lib/<job_config>.rb")'
+     method_option :hadoop_config, :type => 'string', :desc => 'the path to a Hadoop configuration XML file, defaults to Humboldt-provided config that runs Hadoop in local-mode'
+     method_option :cleanup_before, :type => :boolean, :desc => "automatically remove the output dir before launching (default: #{DEFAULTS[:cleanup_before]})"
+     method_option :data_path, :type => :string, :desc => "input paths will be resolved against this path (default: #{DEFAULTS[:data_path]})"
+     method_option :silent, :type => :boolean, :desc => "silence the hadoop command's logging (default: #{DEFAULTS[:silent]})"
+     method_option :skip_package, :type => :boolean, :desc => "don't package the JAR, use only if you haven't changed anything since the last run (default: #{DEFAULTS[:skip_package]})"
+     method_option :extra_hadoop_args, :type => :array, :desc => "extra arguments to pass on to hadoop (default: #{DEFAULTS[:extra_hadoop_args]})"
+     def run_local
+       check_job!
+       invoke(:package, [], {}) unless options.skip_package?
+       output_path = options[:output] || "data/#{job_config}/output"
+       output_path_parent = File.dirname(output_path)
+       if options.cleanup_before?
+         remove_file(output_path)
+       else
+         check_local_output!(output_path)
+       end
+       unless File.exists?(output_path_parent)
+         empty_directory(output_path_parent)
+       end
+       input_glob = File.join(options[:data_path], options[:input])
+       hadoop_config_path = options[:hadoop_config] || default_hadoop_config_path
+       run_command('hadoop', 'jar', project_jar, '-conf', hadoop_config_path, job_config, input_glob, output_path, *options[:extra_hadoop_args])
+     end
+
+     desc 'run-emr', 'run a job in Elastic MapReduce'
+     method_option :input, :type => :string, :required => true, :desc => 'input glob, will be resolved against the data bucket'
+     method_option :output, :type => :string, :desc => 'the output directory, defaults to "<project_name>/<job_config>/output" in the job bucket'
+     method_option :job_config, :type => 'string', :desc => 'the name of the Ruby file containing the job configuration, defaults to the project name (e.g. "lib/<job_config>.rb")'
+     method_option :cleanup_before, :type => :boolean, :desc => "automatically remove the output dir before launching (default: #{DEFAULTS[:cleanup_before]})"
+     method_option :data_bucket, :type => :string, :desc => "S3 bucket containing input data (default: #{DEFAULTS[:data_bucket]})"
+     method_option :job_bucket, :type => :string, :desc => "S3 bucket to upload JAR, output logs and results into (default: #{DEFAULTS[:job_bucket]})"
+     method_option :instance_count, :type => :numeric, :desc => "the number of worker instances to launch (default: #{DEFAULTS[:instance_count]})"
+     method_option :instance_type, :type => :string, :desc => "the worker instance type, see http://ec2pricing.iconara.info/ for available types (default: #{DEFAULTS[:instance_type]})"
+     method_option :spot_instances, :type => :array, :lazy_default => [], :desc => 'use spot instances; either an explicit list of instance groups or no value to run all groups as spot instances'
+     method_option :bid_price, :type => :string, :desc => "how much to bid for spot instances, see http://ec2pricing.iconara.info/ for current spot prices (default: #{DEFAULTS[:bid_price]})"
+     method_option :poll, :type => :boolean, :desc => "poll the job's status every 10s and display (default: #{DEFAULTS[:poll]})"
+     method_option :skip_package, :type => :boolean, :desc => "don't package the JAR, use only if you haven't changed anything since the last run (default: #{DEFAULTS[:skip_package]})"
+     method_option :skip_prepare, :type => :boolean, :desc => "don't upload the JAR and bootstrap files, use only if you haven't changed anything since the last run (default: #{DEFAULTS[:skip_prepare]})"
+     method_option :extra_hadoop_args, :type => :array, :desc => "extra arguments to pass on to hadoop (default: #{DEFAULTS[:extra_hadoop_args]})"
+     method_option :ec2_key_name, :type => :string, :desc => 'The name of an EC2 key pair to enable SSH access to master node'
+     method_option :aws_region, :type => :string, :desc => "The AWS region where the EMR flow is to run (default: #{DEFAULTS[:aws_region]})"
+     method_option :hadoop_version, :type => :string, :desc => "The EMR Hadoop version to use (default: #{DEFAULTS[:hadoop_version]})"
+     def run_emr
+       check_job!
+       invoke(:package, [], {}) unless options.skip_package?
+       flow = EmrFlow.new(job_config, options[:input], job_package, emr, data_bucket, job_bucket, options[:output])
+       if options.cleanup_before?
+         say_status(:remove, flow.output_uri)
+         flow.cleanup!
+       end
+       unless options.skip_prepare?
+         say_status(:upload, flow.jar_uri)
+         flow.prepare!
+       end
+       say_status(:warning, "No EC2 key name configured. You will not be able to access the master node via SSH.", :yellow) unless options[:ec2_key_name]
+       job_flow = flow.run!(
+         bid_price: options[:bid_price],
+         instance_count: options[:instance_count],
+         instance_type: options[:instance_type],
+         spot_instances: options[:spot_instances],
+         extra_hadoop_args: options[:extra_hadoop_args],
+         ec2_key_name: options[:ec2_key_name],
+         hadoop_version: options[:hadoop_version]
+       )
+       File.open('.humboldtjob', 'w') { |io| io.puts(job_flow.job_flow_id) }
+       say_status(:started, %{EMR job flow "#{job_flow.job_flow_id}"})
+     end
+
+     desc 'emr-job', 'show status of the last EMR job'
+     def emr_job
+       if File.exists?('.humboldtjob')
+         job_flow_id = File.read('.humboldtjob').strip
+         job_flow = emr.job_flows[job_flow_id]
+         print_job_flow_extended_status(job_flow)
+       else
+         say_status(:warning, 'Could not determine last job flow ID')
+       end
+     end
+
+     desc 'emr-jobs', 'list all EMR jobs'
+     def emr_jobs
+       emr.job_flows.each do |job_flow|
+         print_job_flow_status(job_flow)
+       end
+     end
+
+     desc 'configure', 'Configure humboldt for the current project'
+     def configure
+       say("Please ensure you are located at the root directory of the project you are configuring.", :yellow)
+       configuration = options_from_config_file
+       say('EMR configuration', :green)
+       configuration[:ec2_key_name] = ask("EC2 key pair name to enable SSH access to EMR master node: [#{config_file_options_with_defaults[:ec2_key_name]}]")
+       configuration[:aws_region] = ask("AWS region: [#{config_file_options_with_defaults[:aws_region]}]")
+       configuration[:hadoop_version] = ask("Hadoop version: [#{config_file_options_with_defaults[:hadoop_version]}]")
+       configuration[:data_bucket] = ask("Input data S3 bucket: [#{config_file_options_with_defaults[:data_bucket]}]")
+       configuration[:job_bucket] = ask("Job S3 bucket (where JAR is uploaded, output logs and job output go to): [#{config_file_options_with_defaults[:job_bucket]}]")
+       configuration.each do |key, value|
+         value = configuration[key] = config_file_options_with_defaults[key] if value.empty?
+         configuration.delete(key) if value.empty? || value == DEFAULTS[key]
+       end
+       File.open('.humboldt.yml', 'w') { |f| YAML.dump(configuration, f) }
+       say('Updated .humboldt.yml', :green)
+     end
+
+     no_commands do
+       def options
+         @extended_options ||= Thor::CoreExt::HashWithIndifferentAccess.new(config_file_options_with_defaults.merge(super))
+       end
+     end
+
+     private
+
+     ISO_DATE_TIME = '%Y-%m-%d %H:%M:%S'.freeze
+
+     def project_jar
+       @project_jar ||= Dir['build/*.jar'].reject { |path| path.start_with?('build/jruby-complete') }.first
+     end
+
+     def job_package
+       @job_package ||= Rubydoop::Package.new(lib_jars: Dir[File.expand_path('../../**/*.jar', __FILE__)])
+     end
+
+     def job_config
+       options[:job_config] || job_package.project_name
+     end
+
+     def default_hadoop_config_path
+       File.expand_path('../../../config/hadoop-local.xml', __FILE__)
+     end
+
+     def s3
+       @s3 ||= AWS::S3.new
+     end
+
+     def emr
+       @emr ||= AWS::EMR.new(region: options[:aws_region])
+     end
+
+     def job_bucket
+       @job_bucket ||= s3.buckets[options[:job_bucket]]
+     end
+
+     def data_bucket
+       @data_bucket ||= s3.buckets[options[:data_bucket]]
+     end
+
+     def check_job!
+       raise Thor::Error, "No such job: #{job_config}" unless File.exists?("lib/#{job_config}.rb")
+     end
+
+     def relative_path(path)
+       path.sub(Dir.pwd + '/', '')
+     end
+
+     def check_local_output!(path)
+       if File.exists?(path)
+         raise Thor::Error, "#{options[:output]} already exists!"
+       end
+     end
+
+     def run_command(*args)
+       say_status(:running, 'Hadoop started')
+       Open3.popen3(*args) do |stdin, stdout, stderr, wait_thr|
+         stdin.close
+         stdout_printer = Thread.new(stdout) do |stdout|
+           while line = stdout.gets
+             say(line.chomp)
+           end
+         end
+         stderr_printer = Thread.new(stderr) do |stderr|
+           filter = HadoopStatusFilter.new(stderr, self, options.silent?)
+           filter.run
+         end
+         stdout_printer.join
+         stderr_printer.join
+         if wait_thr.value.exitstatus == 0
+           say_status(:done, 'Job completed')
+         else
+           say_status(:failed, 'Job failed', :red)
+         end
+       end
+     end
+
+     def print_job_flow_extended_status(job_flow)
+       id = job_flow.job_flow_id
+       state = job_flow.state
+       created_at = job_flow.created_at.strftime(ISO_DATE_TIME)
+       change_reason = job_flow.last_state_change_reason
+       say_status(:started, created_at)
+       say_status(:state, state)
+       say_status(:change, change_reason)
+     rescue => e
+       say_status(:error, e.message, :red)
+       sleep 1
+       retry
+     end
+
+     def print_job_flow_status(job_flow)
+       id = job_flow.job_flow_id
+       state = job_flow.state
+       created_at = job_flow.created_at.strftime(ISO_DATE_TIME)
+       change_reason = job_flow.last_state_change_reason
+       say_status(:status, sprintf('%-15s %-10s %19s %s', id, state, created_at, change_reason))
+     rescue => e
+       say_status(:error, e.message, :red)
+       sleep 1
+       retry
+     end
+
+     def config_file_options_with_defaults
+       @config_file_options_with_defaults ||= DEFAULTS.merge(options_from_config_file)
+     end
+
+     def options_from_config_file
+       @options_from_config_file ||= begin
+         ::YAML::load_file(".humboldt.yml")
+       rescue Errno::ENOENT
+         {}
+       end
+     end
+   end
+ end
data/lib/humboldt/emr_flow.rb ADDED
@@ -0,0 +1,198 @@
+ # encoding: utf-8
+
+ module Humboldt
+   class EmrFlow
+     attr_reader :output_path
+
+     def initialize(*args)
+       @job_name, @input_glob, @package, @emr, @data_bucket, @job_bucket, @output_path = args
+       @output_path ||= "#{@package.project_name}/#{@job_name}/output"
+     end
+
+     def prepare!
+       upload_bootstrap_task_files!
+       upload_jar!
+     end
+
+     def cleanup!
+       delete_output_dir!
+     end
+
+     def run!(launch_options={})
+       check_jar!
+       check_output_dir!
+       create_flow!(launch_options)
+     end
+
+     def jar_path
+       "#{@package.project_name}/#{File.basename(@package.jar_path)}"
+     end
+
+     def jar_uri
+       s3_uri(jar_path)
+     end
+
+     def output_uri
+       s3_uri(output_path)
+     end
+
+     def log_path
+       "#{@package.project_name}/#{@job_name}/logs"
+     end
+
+     private
+
+     BOOTSTRAP_TASK_FILES = {
+       :remove_old_jruby => 'config/emr-bootstrap/remove_old_jruby.sh'
+     }.freeze
+
+     def s3_uri(path, options={})
+       protocol = options[:protocol] || 's3'
+       bucket = options[:bucket] || @job_bucket
+       "#{protocol}://#{bucket.name}/#{path}"
+     end
+
+     def upload_bootstrap_task_files!
+       BOOTSTRAP_TASK_FILES.values.each do |local_path|
+         remote_obj = @job_bucket.objects["#{@package.project_name}/#{local_path}"]
+         remote_obj.write(Pathname.new(File.expand_path(local_path, "#{__FILE__}/../../..")))
+       end
+     end
+
+     def upload_jar!
+       # TODO: upload only if not exists and MD5 != ETag
+       jar_obj = @job_bucket.objects[jar_path]
+       jar_obj.write(Pathname.new(@package.jar_path))
+     end
+
+     def check_jar!
+       unless @job_bucket.objects.with_prefix(jar_path).any?
+         raise "Job JAR missing (#{s3_uri(jar_path)})"
+       end
+     end
+
+     def check_output_dir!
+       if @job_bucket.objects.with_prefix(output_path).any?
+         raise "Output directory already exists (#{s3_uri(output_path)})"
+       end
+     end
+
+     def delete_output_dir!
+       @job_bucket.objects.with_prefix(output_path).delete_all
+     end
+
+     def job_flow_configuration(launch_options)
+       {
+         :log_uri => s3_uri(log_path),
+         :instances => instance_configuration(launch_options),
+         :steps => [step_configuration(launch_options)],
+         :bootstrap_actions => bootstrap_actions,
+         :visible_to_all_users => true
+       }
+     end
+
+     def instance_configuration(launch_options)
+       {
+         :ec2_key_name => launch_options[:ec2_key_name],
+         :hadoop_version => launch_options[:hadoop_version],
+         :instance_groups => InstanceGroupConfiguration.create(launch_options)
+       }
+     end
+
+     def bootstrap_actions
+       remove_old_jruby_action = {
+         :name => 'remove_old_jruby',
+         :script_bootstrap_action => {
+           :path => s3_uri("#{@package.project_name}/#{BOOTSTRAP_TASK_FILES[:remove_old_jruby]}")
+         }
+       }
+
+       # http://hadoop.apache.org/docs/r1.0.3/mapred-default.html
+       configure_hadoop_action = {
+         :name => 'configure_hadoop',
+         :script_bootstrap_action => {
+           :path => 's3://eu-west-1.elasticmapreduce/bootstrap-actions/configure-hadoop',
+           :args => [
+             '-m', 'mapred.job.reuse.jvm.num.tasks=-1',
+             '-m', 'mapred.map.tasks.speculative.execution=false',
+             '-m', 'mapred.reduce.tasks.speculative.execution=false'
+           ]
+         }
+       }
+
+       [remove_old_jruby_action, configure_hadoop_action]
+     end
+
+     def step_configuration(launch_options)
+       {
+         :name => @package.project_name,
+         :hadoop_jar_step => {
+           :jar => s3_uri(jar_path),
+           :args => [
+             @job_name,
+             s3_uri(@input_glob, protocol: 's3n', bucket: @data_bucket),
+             s3_uri(output_path, protocol: 's3n'),
+             *launch_options[:extra_hadoop_args]
+           ]
+         }
+       }
+     end
+
+     def create_flow!(launch_options)
+       job_flow = @emr.job_flows.create(@package.project_name, job_flow_configuration(launch_options))
+     end
+
+     module InstanceGroupConfiguration
+       extend self
+
+       # TODO: add 'task' group when support is added for 'tasks'
+       INSTANCE_GROUPS = %w[master core].freeze
+       MASTER_INSTANCE_TYPE = 'm1.small'.freeze
+       DEFAULT_CORE_INSTANCE_TYPE = 'c1.xlarge'.freeze
+       DEFAULT_BID_PRICE = '0.2'.freeze
+       DEFAULT_CORE_INSTANCE_COUNT = 4
+
+       INSTANCE_TYPE_MAPPINGS = {
+         'master' => MASTER_INSTANCE_TYPE,
+         'core' => DEFAULT_CORE_INSTANCE_TYPE
+       }.freeze
+
+       INSTANCE_COUNT_MAPPINGS = {
+         'master' => 1,
+         'core' => DEFAULT_CORE_INSTANCE_COUNT
+       }.freeze
+
+       def base_configuration(group)
+         {:name => "#{group.capitalize} Group", :instance_role => group.upcase}
+       end
+
+       def configure_type_and_count(group, configuration, options = {})
+         if group == 'core'
+           configuration[:instance_type] = options[:instance_type]
+           configuration[:instance_count] = options[:instance_count]
+         end
+
+         configuration[:instance_type] ||= INSTANCE_TYPE_MAPPINGS[group]
+         configuration[:instance_count] ||= INSTANCE_COUNT_MAPPINGS[group]
+       end
+
+       def configure_market(group, configuration, spot_instances, bid_price)
+         if spot_instances && (spot_instances.empty? || spot_instances.include?(group))
+           configuration[:market] = 'SPOT'
+           configuration[:bid_price] = bid_price || DEFAULT_BID_PRICE
+         else
+           configuration[:market] = 'ON_DEMAND'
+         end
+       end
+
+       def create(options)
+         INSTANCE_GROUPS.map do |group|
+           configuration = base_configuration(group)
+           configure_type_and_count(group, configuration, options)
+           configure_market(group, configuration, options[:spot_instances], options[:bid_price])
+           configuration
+         end
+       end
+     end
+   end
+ end
data/lib/humboldt/hadoop_status_filter.rb ADDED
@@ -0,0 +1,97 @@
+ # encoding: utf-8
+
+ module Humboldt
+   class HadoopStatusFilter
+     def initialize(hadoop_stderr, shell, silent)
+       @hadoop_stderr = hadoop_stderr
+       @shell = shell
+       @silent = silent
+       @counters = {}
+     end
+
+     def run
+       counter_group = nil
+       while line = @hadoop_stderr.gets
+         if @counters_printing && (hadoop_log?(line) || line =~ /^\t+/)
+           case line.chomp
+           when /(?:JobClient: |\t+)([^\t]+)=(\d+)$/
+             if counter_group
+               @counters[counter_group] ||= {}
+               @counters[counter_group][$1.strip] = $2.to_i
+             end
+           when /(?:JobClient: |\t+)([^\t]+)$/
+             counter_group = $1.strip
+           end
+         elsif @error_printing && !hadoop_log?(line) && !ignore?(line)
+           report_error(line)
+         elsif ignore?(line)
+           # do nothing
+         else
+           @counters_printing = false
+           @error_printing = false
+           case line
+           when /map (\d+)% reduce (\d+)%/
+             report_progress($1, $2)
+           when /Counters: \d+/
+             @counters_printing = true
+           else
+             unless hadoop_log?(line)
+               @error_printing = true
+               if line =~ /warning(!|:)/i
+                 @error_type = :warning
+               else
+                 @error_type = :error
+               end
+               report_error(line)
+             end
+           end
+         end
+         @shell.say(line.chomp, :red) unless @silent
+       end
+       print_counters_table
+     end
+
+     private
+
+     def hadoop_log?(line)
+       line =~ /(?:INFO|WARN) (?:mapred|input|output|util|jvm|mapreduce)\./
+     end
+
+     def ignore?(line)
+       case line
+       when /^\s*$/,
+            /Warning: \$HADOOP_HOME is deprecated/,
+            /Unable to load realm info from SCDynamicStore/,
+            /Unable to load native-hadoop library/,
+            /Snappy native library not loaded/,
+            /Configuration.deprecation:/,
+            /WARN conf.Configuration.*attempt to override final parameter.*ignoring/i
+         true
+       else
+         false
+       end
+     end
+
+     def report_progress(map, reduce)
+       @shell.say_status(:progress, "map #{map}%, reduce #{reduce}%")
+     end
+
+     def report_error(line)
+       @shell.say_status(@error_type, line.chomp, @error_type == :error ? :red : :yellow)
+     end
+
+     def print_counters_table
+       table = @counters.flat_map do |group, counters|
+         [
+           [group, *counters.first],
+           *counters.drop(1).map { |counter, value| ['', counter, value] },
+           ['', '', '']
+         ]
+       end
+       table.pop
+       @shell.say
+       @shell.print_table(table)
+       @shell.say
+     end
+   end
+ end
data/lib/humboldt/java_lib.rb ADDED
@@ -0,0 +1,5 @@
+ module Humboldt
+   module JavaLib
+     include_package 'humboldt'
+   end
+ end
data/lib/humboldt/mapper.rb ADDED
@@ -0,0 +1,15 @@
+ # encoding: utf-8
+
+ module Humboldt
+   class Mapper < Processor
+     class << self
+       def map(&block)
+         define_method(:map) do |key, value, context|
+           @in_key.hadoop = key
+           @in_value.hadoop = value
+           instance_exec(@in_key.ruby, @in_value.ruby, &block)
+         end
+       end
+     end
+   end
+ end
data/lib/humboldt/patterns/sum_reducer.rb ADDED
@@ -0,0 +1,16 @@
+ # encoding: utf-8
+
+ module Humboldt
+   module Patterns
+     class SumReducer < Reducer
+       input :text, :long
+       output :text, :long
+
+       reduce do |key, values|
+         sum = 0
+         values.each { |v| sum += v }
+         emit(key, sum)
+       end
+     end
+   end
+ end
data/lib/humboldt/prefix_grouping.rb ADDED
@@ -0,0 +1,46 @@
+ # encoding: utf-8
+
+ require 'zlib'
+
+
+ module Humboldt
+   class BinaryPrefixPartitioner
+     def initialize(cutoff_index)
+       @cutoff_index = cutoff_index
+     end
+
+     def partition(key, value, num_partitions)
+       length = @cutoff_index > key.length ? key.length : @cutoff_index
+       prefix = String.from_java_bytes(key.bytes)[0, length]
+       Zlib.crc32(prefix) % num_partitions
+     end
+   end
+
+   class DropBinaryPrefixPartitioner < BinaryPrefixPartitioner
+     def partition(key, value, num_partitions)
+       length = key.length > @cutoff_index ? key.length - @cutoff_index : 0
+       prefix = String.from_java_bytes(key.bytes)[0, length]
+       Zlib.crc32(prefix) % num_partitions
+     end
+   end
+
+   class BinaryPrefixComparator
+     def initialize(cutoff_index)
+       @cutoff_index = cutoff_index
+     end
+
+     def compare_raw(bytes1, start1, length1, bytes2, start2, length2)
+       subset_length1 = @cutoff_index > length1 ? length1 : @cutoff_index
+       subset_length2 = @cutoff_index > length2 ? length2 : @cutoff_index
+       ::Hadoop::Io::WritableComparator.compareBytes(bytes1, start1, subset_length1, bytes2, start2, subset_length2)
+     end
+   end
+
+   class DropBinaryPrefixComparator < BinaryPrefixComparator
+     def compare_raw(bytes1, start1, length1, bytes2, start2, length2)
+       subset_length1 = length1 - @cutoff_index
+       subset_length2 = length2 - @cutoff_index
+       ::Hadoop::Io::WritableComparator.compareBytes(bytes1, start1, subset_length1, bytes2, start2, subset_length2)
+     end
+   end
+ end
data/lib/humboldt/processor.rb ADDED
@@ -0,0 +1,96 @@
+ # encoding: utf-8
+
+ module Humboldt
+   class Processor
+     class << self
+       def self.type_accessor(*names)
+         names.each do |name|
+           module_eval <<-EOA
+             def #{name}
+               @#{name} || superclass.#{name}
+             end
+             def #{name}=(type)
+               @#{name} = TypeConverter[type]
+               define_method(:#{name}_accessor) do
+                 TypeConverter[type].new
+               end
+             end
+           EOA
+         end
+       end
+
+       type_accessor :input_key, :input_value, :output_key, :output_value
+
+       def input(*types)
+         self.input_key = types.first
+         self.input_value = types.last
+       end
+
+       def output(*types)
+         self.output_key = types.first
+         self.output_value = types.last
+       end
+
+       def setup(&block)
+         define_method(:instance_setup, &block)
+         private(:instance_setup)
+       end
+
+       def cleanup(&block)
+         define_method(:instance_cleanup, &block)
+         private(:instance_cleanup)
+       end
+     end
+
+     attr_reader :current_context
+
+     def setup(context)
+       @current_context = context
+       @in_key = input_key_accessor
+       @in_value = input_value_accessor
+       @out_key = output_key_accessor
+       @out_value = output_value_accessor
+       unless Hadoop::Mapreduce::Job.instance_methods.include?(:add_cache_file)
+         create_symlinks!
+       end
+       instance_setup
+     end
+
+     def cleanup(context)
+       instance_cleanup
+     end
+
+     protected
+
+     def emit(key, value)
+       @out_key.ruby = key
+       @out_value.ruby = value
+       @current_context.write(@out_key.hadoop, @out_value.hadoop)
+     end
+
+     private
+
+     def instance_setup
+     end
+
+     def instance_cleanup
+     end
+
+     def create_symlinks!
+       distributed_cache = ::Hadoop::FileCache::DistributedCache
+       files = distributed_cache.get_cache_files(@current_context.configuration)
+       local_files = distributed_cache.get_local_cache_files(@current_context.configuration)
+       if files && local_files
+         work_dir = ENV['HADOOP_WORK_DIR']
+         files.each_with_index do |file, i|
+           target = local_files[i].to_s
+           link_path = File.join(work_dir, file.fragment)
+           FileUtils.mkdir_p(File.dirname(link_path))
+           unless File.exists?(link_path)
+             FileUtils.ln_s(target, link_path)
+           end
+         end
+       end
+     end
+   end
+ end
data/lib/humboldt/reducer.rb ADDED
@@ -0,0 +1,34 @@
+ # encoding: utf-8
+
+ module Humboldt
+   class Reducer < Processor
+     class << self
+       def reduce(&block)
+         define_method(:reduce) do |key, values, context|
+           @in_key.hadoop = key
+           values_enumerator = TypeConversionEnumerator.new(@in_value, values.iterator)
+           instance_exec(@in_key.ruby, values_enumerator, &block)
+         end
+       end
+     end
+
+     class TypeConversionEnumerator < Enumerator
+       def initialize(*args)
+         @value_converter, @hadoop_iterator = args
+       end
+
+       def each
+         while @hadoop_iterator.has_next
+           @value_converter.hadoop = @hadoop_iterator.next
+           yield @value_converter.ruby
+         end
+       end
+
+       def next
+         raise StopIteration unless @hadoop_iterator.has_next
+         @value_converter.hadoop = @hadoop_iterator.next
+         @value_converter.ruby
+       end
+     end
+   end
+ end
data/lib/humboldt/rspec.rb ADDED
@@ -0,0 +1,100 @@
+ # encoding: utf-8
+
+ require 'humboldt'
+
+
+ module RunnerHelpers
+   def run_nokey_mapper(mapper, *values, &context_callback)
+     key = mapper.input_key_accessor.ruby
+     args = values.map { |value| [key, value] }
+     run_mapper(mapper, *args, &context_callback)
+   end
+
+   def run_mapper(mapper, *entries, &context_callback)
+     in_value = mapper.input_value_accessor
+     run(mapper, :map, context_callback, *entries) do |value|
+       in_value.ruby = value
+       in_value.hadoop
+     end
+   end
+
+   def run_reducer(reducer, *entries, &context_callback)
+     run(reducer, :reduce, context_callback, *entries) do |value|
+       fake_iterator(*value.map do |v|
+         in_value = reducer.input_value_accessor
+         in_value.ruby = v
+         in_value.hadoop
+       end)
+     end
+   end
+
+   def run(runner, method, context_callback, *entries)
+     in_key = runner.input_key_accessor
+     context = FakeContext.new(runner.output_key_accessor, runner.output_value_accessor)
+     context_callback.call(context) if context_callback
+     runner.setup(context)
+     entries.each do |entry|
+       in_key.ruby = entry.first
+       runner.send(method, in_key.hadoop, yield(entry.last), context)
+     end
+     runner.cleanup(context)
+     context.results
+   end
+
+   def fake_iterator(*values)
+     FakeIterable.new(values)
+   end
+
+   class FakeIterable
+     def initialize(values)
+       @values = values
+     end
+     def iterator
+       FakeIterator.new(@values.dup)
+     end
+   end
+
+   class FakeIterator
+     def initialize(values)
+       @values = values
+     end
+     def has_next
+       !@values.empty?
+     end
+     def next
+       @values.shift
+     end
+   end
+
+   class FakeContext
+     attr_reader :results, :counters
+
+     def initialize(key_accessor, value_accessor)
+       @key_accessor, @value_accessor = key_accessor, value_accessor
+       @results = []
+       @counters = Hash.new { |h,k| h[k] = Hash.new { |h2,k2| h2[k2] = 0 } }
+     end
+
+     def write(key, value)
+       @key_accessor.hadoop = key
+       @value_accessor.hadoop = value
+       @results << [@key_accessor.ruby, @value_accessor.ruby]
+     end
+
+     def configuration
+       @configuration ||= ::Hadoop::Conf::Configuration.new.tap do |config|
+         config.set 'mapred.job.tracker', 'local'
+       end
+     end
+
+     def get_counter(group, name)
+       FakeCounter.new do |amount|
+         @counters[group][name] += amount
+       end
+     end
+   end
+ end
+
+ RSpec.configure do |conf|
+   conf.include(RunnerHelpers)
+ end
data/lib/humboldt/type_converters.rb ADDED
@@ -0,0 +1,180 @@
+ # encoding: utf-8
+
+ module Humboldt
+   module TypeConverter
+     class Binary
+       HADOOP = ::Hadoop::Io::BytesWritable
+       RUBY = ::String
+
+       attr_reader :hadoop
+
+       def hadoop=(value)
+         unless value.is_a?(HADOOP)
+           raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected #{HADOOP}"
+         end
+         @hadoop = value
+       end
+
+       def initialize
+         @hadoop = HADOOP.new
+       end
+
+       def ruby
+         String.from_java_bytes(@hadoop.bytes).byteslice(0, @hadoop.length)
+       end
+
+       def ruby=(value)
+         unless value.is_a?(RUBY)
+           raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected #{RUBY}"
+         end
+
+         @hadoop.set(value.to_java_bytes, 0, value.bytesize)
+       end
+     end
+
+     begin
+       require 'msgpack'
+
+       class Encoded < Binary
+         def ruby=(value)
+           unless value.is_a?(Hash) || value.is_a?(Array)
+             raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected Hash or Array"
+           end
+           packed = MessagePack.pack(value)
+           @hadoop.set(packed.to_java_bytes, 0, packed.bytesize)
+         end
+
+         def ruby
+           packed = String.from_java_bytes(@hadoop.bytes).byteslice(0, @hadoop.length)
+           MessagePack.unpack(packed, encoding: Encoding::UTF_8)
+         end
+       end
+     rescue LoadError
+     end
+
+     class Text
+       HADOOP = ::Hadoop::Io::Text
+       RUBY = ::String
+
+       attr_reader :hadoop
+
+       def hadoop=(value)
+         unless value.is_a?(HADOOP)
+           raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected #{HADOOP}"
+         end
+         @hadoop = value
+       end
+
+       def initialize
+         @hadoop = HADOOP.new
+       end
+
+       def ruby
+         String.from_java_bytes(@hadoop.bytes).byteslice(0, @hadoop.length).force_encoding(Encoding::UTF_8)
+       end
+
+       def ruby=(value)
+         unless value.is_a?(RUBY)
+           raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected #{RUBY}"
+         end
+
+         if value.encoding == Encoding::UTF_8
+           @hadoop.set(value.to_java_bytes, 0, value.bytesize)
+         else
+           @hadoop.set(value)
+         end
+       end
+     end
+
+     begin
+       require 'json'
+
+       class Json < Text
+         def ruby=(value)
+           unless value.is_a?(Hash) || value.is_a?(Array)
+             raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected Hash or Array"
+           end
+           @hadoop.set(JSON.generate(value))
+         end
+
+         def ruby
+           JSON.parse(hadoop.to_s)
+         end
+       end
+     end
+
+     class Long
+       HADOOP = ::Hadoop::Io::LongWritable
+       RUBY = ::Integer
+
+       attr_reader :hadoop
+
+       def hadoop=(value)
+         unless value.is_a?(HADOOP)
+           raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected #{HADOOP}"
+         end
+         @hadoop = value
+       end
+
+       def initialize
+         @hadoop = HADOOP.new
+       end
+
+       def ruby
+         @hadoop.get
+       end
+
+       def ruby=(value)
+         unless value.is_a?(Integer)
+           raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected #{RUBY}"
+         end
+
+         @hadoop.set value
+       end
+     end
+
+     class None
+       HADOOP = ::Hadoop::Io::NullWritable
+       RUBY = ::NilClass
+
+       def hadoop
+         HADOOP.get
+       end
+
+       def hadoop=(value)
+         unless value.is_a?(HADOOP)
+           raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected #{HADOOP}"
+         end
+       end
+
+       def ruby
+         nil
+       end
+
+       def ruby=(value)
+         unless value.nil?
+           raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected #{RUBY}"
+         end
+       end
+     end
+
+     TYPE_CONVERTER_CLASS_CACHE = Hash.new { |h,k| h[k] = const_get(k.to_s.capitalize) }
+
+     def self.[](name)
+       TYPE_CONVERTER_CLASS_CACHE[name]
+     end
+
+     FROM_HADOOP_MAPPINGS = {
+       ::Hadoop::Io::Text => Text,
+       ::Hadoop::Io::BytesWritable => Binary,
+       ::Hadoop::Io::LongWritable => Long,
+       ::Hadoop::Io::NullWritable => None
+     }.freeze
+
+     def self.from_hadoop(hadoop_class)
+       accessor = FROM_HADOOP_MAPPINGS[hadoop_class]
+       raise ArgumentError, "Unsupported Hadoop type: #{hadoop_class}" unless accessor
+       accessor
+     end
+   end
+ end
data/lib/humboldt/version.rb ADDED
@@ -0,0 +1,5 @@
+ # encoding: utf-8
+
+ module Humboldt
+   VERSION = '1.0.0'.freeze
+ end
data/lib/humboldt.jar ADDED
Binary file
data/lib/humboldt.rb ADDED
@@ -0,0 +1,16 @@
+ # encoding: utf-8
+
+ require 'fileutils'
+ require 'rubydoop'
+ require 'hadoop'
+
+ require 'humboldt/java_lib'
+
+ require 'ext/hadoop'
+ require 'ext/rubydoop'
+
+ require 'humboldt/type_converters'
+ require 'humboldt/processor'
+ require 'humboldt/mapper'
+ require 'humboldt/reducer'
+ require 'humboldt/prefix_grouping'
metadata ADDED
@@ -0,0 +1,112 @@
+ --- !ruby/object:Gem::Specification
+ name: humboldt
+ version: !ruby/object:Gem::Version
+   version: 1.0.0
+ platform: java
+ authors:
+ - The Burt Platform Team
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2014-06-03 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   name: thor
+   prerelease: false
+   type: :runtime
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 1.1.2
+   name: rubydoop
+   prerelease: false
+   type: :runtime
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 1.1.2
+ - !ruby/object:Gem::Dependency
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: 1.16.0
+     - - <
+       - !ruby/object:Gem::Version
+         version: 1.33.0
+   name: aws-sdk
+   prerelease: false
+   type: :runtime
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: 1.16.0
+     - - <
+       - !ruby/object:Gem::Version
+         version: 1.33.0
+ description: Humboldt provides a mapreduce API abstraction built on top of Rubydoop, and tools to run Hadoop jobs effortlessly both locally and on Amazon EMR
+ email:
+ - theo@burtcorp.com
+ executables:
+ - humboldt
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - bin/humboldt
+ - config/emr-bootstrap/remove_old_jruby.sh
+ - config/hadoop-local.xml
+ - lib/ext/hadoop.rb
+ - lib/ext/rubydoop.rb
+ - lib/humboldt.jar
+ - lib/humboldt.rb
+ - lib/humboldt/cli.rb
+ - lib/humboldt/emr_flow.rb
+ - lib/humboldt/hadoop_status_filter.rb
+ - lib/humboldt/java_lib.rb
+ - lib/humboldt/mapper.rb
+ - lib/humboldt/patterns/sum_reducer.rb
+ - lib/humboldt/prefix_grouping.rb
+ - lib/humboldt/processor.rb
+ - lib/humboldt/reducer.rb
+ - lib/humboldt/rspec.rb
+ - lib/humboldt/type_converters.rb
+ - lib/humboldt/version.rb
+ homepage: http://github.com/burtcorp/humboldt
+ licenses:
+ - BSD-3-Clause
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.2.2
+ signing_key:
+ specification_version: 4
+ summary: Tools and libraries for simplifying running Rubydoop jobs locally and on AWS Elastic MapReduce
+ test_files: []
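
For orientation, here is a minimal sketch of how a project might wire the pieces above together: it uses the Mapper DSL from lib/humboldt/mapper.rb, the SumReducer pattern from lib/humboldt/patterns/sum_reducer.rb and the job-definition extensions from lib/ext/rubydoop.rb. The file name, job name and WordCountMapper class are hypothetical, and the Rubydoop.configure/job DSL is assumed from the rubydoop ~> 1.1 dependency, not defined in this gem.

# lib/word_count.rb -- hypothetical job configuration, run with e.g. `humboldt run-local --input 'words/*'`
require 'humboldt'
require 'humboldt/patterns/sum_reducer'

class WordCountMapper < Humboldt::Mapper
  input :long, :text    # plain text input: byte-offset key, line value
  output :text, :long

  map do |_offset, line|
    # emit a count of 1 per word; SumReducer adds the counts per key
    line.split(/\W+/).each { |word| emit(word.downcase, 1) unless word.empty? }
  end
end

# Assumes Rubydoop's job DSL; the Humboldt CLI passes the input glob and output path as arguments.
Rubydoop.configure do |input_path, output_path|
  job 'word_count' do
    input input_path
    output output_path
    mapper WordCountMapper
    reducer Humboldt::Patterns::SumReducer
  end
end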