humboldt 1.0.0-java

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 7ca8c0825572a27fc4673c042f4d5677e250340a
4
+ data.tar.gz: 910d598f4df79e42eda1bbf4d022db9a160c48c3
5
+ SHA512:
6
+ metadata.gz: 6cf04eb473b93684dfde74f56ed6113d9c8d15fbfcc7686b33d878570a804ee5c29c11e0fdc29692b689f9b5039e8f0a21a93781d669c2ae4d4dc8384f7d436f
7
+ data.tar.gz: 84d6fab2395c97ebda5eac18684371ee475ee336823a4cb936f11196b8905911436af418bfb3386649c6c3c2097095c38a1aab2ada827f56e33f8020c50e6f38
data/bin/humboldt ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ $: << File.expand_path('../../lib', __FILE__)
4
+
5
+ require 'humboldt/cli'
6
+
7
+
8
+ Humboldt::Cli.start
data/config/emr-bootstrap/remove_old_jruby.sh ADDED
@@ -0,0 +1,11 @@
1
+ #!/bin/bash
2
+
3
+ if [ -e /home/hadoop/lib/jruby-complete-no-joda-1.6.5.jar ]
4
+ then
5
+ rm /home/hadoop/lib/jruby-complete-no-joda-1.6.5.jar
6
+ fi
7
+
8
+ if [ -e /home/hadoop/lib/jruby-complete-1.6.8.jar ]
9
+ then
10
+ rm /home/hadoop/lib/jruby-complete-1.6.8.jar
11
+ fi
data/config/hadoop-local.xml ADDED
@@ -0,0 +1,12 @@
1
+ <?xml version="1.0"?>
2
+
3
+ <configuration>
4
+ <property>
5
+ <name>fs.default.name</name>
6
+ <value>file:///</value>
7
+ </property>
8
+ <property>
9
+ <name>mapred.job.tracker</name>
10
+ <value>local</value>
11
+ </property>
12
+ </configuration>
data/lib/ext/hadoop.rb ADDED
@@ -0,0 +1,10 @@
1
+ # encoding: utf-8
2
+
3
+ module Hadoop
4
+ module FileCache
5
+ include_package 'org.apache.hadoop.filecache'
6
+ end
7
+ module Conf
8
+ include_package 'org.apache.hadoop.conf'
9
+ end
10
+ end
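
These `include_package` wrappers expose the Hadoop Java packages under Ruby module names, so a class like `org.apache.hadoop.conf.Configuration` can be reached as `Hadoop::Conf::Configuration`. A minimal sketch of that lookup, assuming a JRuby process with the Hadoop JARs on the classpath and the gem's `lib` directory on the load path (as set up by `bin/humboldt`):

    # Sketch only: needs JRuby plus the Hadoop JARs that rubydoop/humboldt put on the classpath.
    require 'humboldt'   # pulls in rubydoop, hadoop and this ext/hadoop file

    conf = Hadoop::Conf::Configuration.new
    conf.set('mapred.job.tracker', 'local')
    conf.get('mapred.job.tracker')        # => "local"
    Hadoop::FileCache::DistributedCache   # => the Java class, resolved via include_package
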
data/lib/ext/rubydoop.rb ADDED
@@ -0,0 +1,60 @@
1
+ # encoding: utf-8
2
+
3
+ module Rubydoop
4
+ class JobDefinition
5
+ alias mapperrr mapper
6
+ def mapper(cls)
7
+ map_output_key cls.output_key.const_get(:HADOOP) if cls.respond_to?(:output_key)
8
+ map_output_value cls.output_value.const_get(:HADOOP) if cls.respond_to?(:output_value)
9
+ mapperrr cls
10
+ end
11
+
12
+ alias reducerrr reducer
13
+ def reducer(cls)
14
+ output_key cls.output_key.const_get(:HADOOP) if cls.respond_to?(:output_key)
15
+ output_value cls.output_value.const_get(:HADOOP) if cls.respond_to?(:output_value)
16
+ reducerrr cls
17
+ end
18
+
19
+ alias inputtt input
20
+ def input(paths, options={})
21
+ options = options.dup
22
+ format = options[:format]
23
+ STDERR.puts "Warning! Using `format: :combined_text` will not work with remote input paths (e.g. S3) and Hadoop 1.x. Cf. https://issues.apache.org/jira/browse/MAPREDUCE-1806" if format == :combined_text
24
+ unless format.nil? or format.is_a?(Class)
25
+ class_name = format.to_s.gsub(/^.|_./) {|x| x[-1,1].upcase } + "InputFormat"
26
+ begin
27
+ options[:format] = Humboldt::JavaLib.const_get(class_name)
28
+ rescue NameError
29
+ end
30
+ end
31
+ inputtt(paths, options)
32
+ end
33
+
34
+ def enable_compression!
35
+ unless local_mode?
36
+ set 'mapred.compress.map.output', true
37
+ set 'mapred.output.compress', true
38
+ set 'mapred.map.output.compression.codec', 'org.apache.hadoop.io.compress.GzipCodec'
39
+ set 'mapred.output.compression.codec', 'org.apache.hadoop.io.compress.GzipCodec'
40
+ set 'mapred.output.compression.type', 'BLOCK'
41
+ end
42
+ end
43
+
44
+ def local_mode?
45
+ @job.configuration.get('mapred.job.tracker') == 'local'
46
+ end
47
+
48
+ def cache_file(file, options = {})
49
+ symlink = options.fetch(:as, File.basename(file))
50
+ if local_mode? && !Hadoop::Mapreduce::Job.instance_methods.include?(:add_cache_file)
51
+ unless File.symlink?(symlink) && File.readlink(symlink) == file
52
+ FileUtils.ln_s file, symlink
53
+ end
54
+ else
55
+ uri = java.net.URI.new("#{file}\##{symlink}")
56
+ Hadoop::FileCache::DistributedCache.add_cache_file(uri, @job.configuration)
57
+ end
58
+ end
59
+ end
60
+ end
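
A hedged sketch of a job configuration that exercises these extensions. The `Rubydoop.configure`/`job` wrapper is Rubydoop's own DSL (assumed here, not shown in this diff), and the `WordCount::*` classes are hypothetical; the calls to `input`, `mapper`, `reducer`, `enable_compression!` and `cache_file` are the ones wrapped or added above.

    # Illustrative job config, e.g. lib/word_count.rb; class and file names are made up.
    require 'humboldt'
    require 'word_count_classes'   # hypothetical file defining WordCount::Mapper and WordCount::Reducer

    Rubydoop.configure do |input_path, output_path|
      job 'word_count' do
        input input_path, format: :combined_text   # resolved to Humboldt::JavaLib::CombinedTextInputFormat when available
        output output_path

        mapper WordCount::Mapper     # also sets map_output_key/map_output_value from the class
        reducer WordCount::Reducer   # also sets output_key/output_value from the class

        enable_compression!          # skipped automatically when mapred.job.tracker is 'local'
        cache_file 'data/stopwords.txt', as: 'stopwords.txt'
      end
    end
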
data/lib/humboldt/cli.rb ADDED
@@ -0,0 +1,263 @@
1
+ # encoding: utf-8
2
+
3
+ require 'thor'
4
+ require 'aws'
5
+ require 'open3'
6
+ require 'rubydoop/package' # this prints an annoying warning in JRuby 1.7.0.RC1
7
+ require 'humboldt/emr_flow'
8
+ require 'humboldt/hadoop_status_filter'
9
+
10
+
11
+
12
+ module Humboldt
13
+ class Cli < Thor
14
+ include Thor::Actions
15
+
16
+ DEFAULTS = {
17
+ data_path: 'data/completes',
18
+ silent: true,
19
+ skip_package: false,
20
+ extra_hadoop_args: [],
21
+ cleanup_before: false,
22
+ instance_count: 4,
23
+ instance_type: 'c1.xlarge',
24
+ spot_instances: nil,
25
+ bid_price: 0.2,
26
+ poll: false,
27
+ skip_prepare: false,
28
+ aws_region: 'eu-west-1',
29
+ hadoop_version: '1.0.3'
30
+ }
31
+
32
+ desc 'package', 'Package job JAR file'
33
+ def package
34
+ say_status(:package, relative_path(job_package.jar_path))
35
+ job_package.create!
36
+ end
37
+
38
+ desc 'run-local', 'run a job in local mode with the hadoop command'
39
+ method_option :input, :type => :string, :required => true, :desc => 'input glob, will be resolved against the data path'
40
+ method_option :output, :type => :string, :desc => 'the output directory, defaults to "data/<job_config>/output"'
41
+ method_option :job_config, :type => 'string', :desc => 'the name of the Ruby file containing the job configuration, defaults to the project name (e.g. "lib/<job_config>.rb")'
42
+ method_option :hadoop_config, :type => 'string', :desc => 'the path to a Hadoop configuration XML file, defaults to Humboldt-provided config that runs Hadoop in local-mode'
43
+ method_option :cleanup_before, :type => :boolean, :desc => "automatically remove the output dir before launching (default: #{DEFAULTS[:cleanup_before]})"
44
+ method_option :data_path, :type => :string, :desc => "input paths will be resolved against this path (default: #{DEFAULTS[:data_path]})"
45
+ method_option :silent, :type => :boolean, :desc => "silence the hadoop command's logging (default: #{DEFAULTS[:silent]})"
46
+ method_option :skip_package, :type => :boolean, :desc => "don't package the JAR, use only if you haven't changed anything since the last run (default: #{DEFAULTS[:skip_package]})"
47
+ method_option :extra_hadoop_args, :type => :array, :desc => "extra arguments to pass on to hadoop (default: #{DEFAULTS[:extra_hadoop_args]})"
48
+ def run_local
49
+ check_job!
50
+ invoke(:package, [], {}) unless options.skip_package?
51
+ output_path = options[:output] || "data/#{job_config}/output"
52
+ output_path_parent = File.dirname(output_path)
53
+ if options.cleanup_before?
54
+ remove_file(output_path)
55
+ else
56
+ check_local_output!(output_path)
57
+ end
58
+ unless File.exists?(output_path_parent)
59
+ empty_directory(output_path_parent)
60
+ end
61
+ input_glob = File.join(options[:data_path], options[:input])
62
+ hadoop_config_path = options[:hadoop_config] || default_hadoop_config_path
63
+ run_command('hadoop', 'jar', project_jar, '-conf', hadoop_config_path, job_config, input_glob, output_path, *options[:extra_hadoop_args])
64
+ end
65
+
66
+ desc 'run-emr', 'run a job in Elastic MapReduce'
67
+ method_option :input, :type => :string, :required => true, :desc => 'input glob, will be resolved against the data bucket'
68
+ method_option :output, :type => :string, :desc => 'the output directory, defaults to "<project_name>/<job_config>/output" in the job bucket'
69
+ method_option :job_config, :type => 'string', :desc => 'the name of the Ruby file containing the job configuration, defaults to the project name (e.g. "lib/<job_config>.rb")'
70
+ method_option :cleanup_before, :type => :boolean, :desc => "automatically remove the output dir before launching (default: #{DEFAULTS[:cleanup_before]})"
71
+ method_option :data_bucket, :type => :string, :desc => "S3 bucket containing input data (default: #{DEFAULTS[:data_bucket]})"
72
+ method_option :job_bucket, :type => :string, :desc => "S3 bucket to upload JAR, output logs and results into (default: #{DEFAULTS[:job_bucket]})"
73
+ method_option :instance_count, :type => :numeric, :desc => "the number of worker instances to launch (default: #{DEFAULTS[:instance_count]})"
74
+ method_option :instance_type, :type => :string, :desc => "the worker instance type, see http://ec2pricing.iconara.info/ for available types (default: #{DEFAULTS[:instance_type]})"
75
+ method_option :spot_instances, :type => :array, :lazy_default => [], :desc => 'use spot instances; either an explicit list of instance groups or no value to run all groups as spot instances'
76
+ method_option :bid_price, :type => :string, :desc => "how much to bid for spot instances, see http://ec2pricing.iconara.info/ for current spot prices (default: #{DEFAULTS[:bid_price]})"
77
+ method_option :poll, :type => :boolean, :desc => "poll the job's status every 10s and display (default: #{DEFAULTS[:poll]})"
78
+ method_option :skip_package, :type => :boolean, :desc => "don't package the JAR, use only if you haven't changed anything since the last run (default: #{DEFAULTS[:skip_package]})"
79
+ method_option :skip_prepare, :type => :boolean, :desc => "don't upload the JAR and bootstrap files, use only if you haven't changed anything since the last run (default: #{DEFAULTS[:skip_prepare]})"
80
+ method_option :extra_hadoop_args, :type => :array, :desc => "extra arguments to pass on to hadoop (default: #{DEFAULTS[:extra_hadoop_args]})"
81
+ method_option :ec2_key_name, :type => :string, :desc => 'The name of an EC2 key pair to enable SSH access to master node'
82
+ method_option :aws_region, :type => :string, :desc => "The AWS region where the EMR flow is to run (default: #{DEFAULTS[:aws_region]})"
83
+ method_option :hadoop_version, :type => :string, :desc => "The EMR Hadoop version to use (default: #{DEFAULTS[:hadoop_version]})"
84
+ def run_emr
85
+ check_job!
86
+ invoke(:package, [], {}) unless options.skip_package?
87
+ flow = EmrFlow.new(job_config, options[:input], job_package, emr, data_bucket, job_bucket, options[:output])
88
+ if options.cleanup_before?
89
+ say_status(:remove, flow.output_uri)
90
+ flow.cleanup!
91
+ end
92
+ unless options.skip_prepare?
93
+ say_status(:upload, flow.jar_uri)
94
+ flow.prepare!
95
+ end
96
+ say_status(:warning, "No EC2 key name configured. You will not be able to access the master node via SSH.", :yellow) unless options[:ec2_key_name]
97
+ job_flow = flow.run!(
98
+ bid_price: options[:bid_price],
99
+ instance_count: options[:instance_count],
100
+ instance_type: options[:instance_type],
101
+ spot_instances: options[:spot_instances],
102
+ extra_hadoop_args: options[:extra_hadoop_args],
103
+ ec2_key_name: options[:ec2_key_name],
104
+ hadoop_version: options[:hadoop_version]
105
+ )
106
+ File.open('.humboldtjob', 'w') { |io| io.puts(job_flow.job_flow_id) }
107
+ say_status(:started, %{EMR job flow "#{job_flow.job_flow_id}"})
108
+ end
109
+
110
+ desc 'emr-job', 'show status of the last EMR job'
111
+ def emr_job
112
+ if File.exists?('.humboldtjob')
113
+ job_flow_id = File.read('.humboldtjob').strip
114
+ job_flow = emr.job_flows[job_flow_id]
115
+ print_job_flow_extended_status(job_flow)
116
+ else
117
+ say_status(:warning, 'Could not determine last job flow ID')
118
+ end
119
+ end
120
+
121
+ desc 'emr-jobs', 'list all EMR jobs'
122
+ def emr_jobs
123
+ emr.job_flows.each do |job_flow|
124
+ print_job_flow_status(job_flow)
125
+ end
126
+ end
127
+
128
+ desc 'configure', 'Configure humboldt for the current project'
129
+ def configure
130
+ say("Please ensure you are located at the root directory of the project you are configuring.", :yellow)
131
+ configuration = options_from_config_file
132
+ say('EMR configuration', :green)
133
+ configuration[:ec2_key_name] = ask("EC2 key pair name to enable SSH access to EMR master node: [#{config_file_options_with_defaults[:ec2_key_name]}]")
134
+ configuration[:aws_region] = ask("AWS region: [#{config_file_options_with_defaults[:aws_region]}]")
135
+ configuration[:hadoop_version] = ask("Hadoop version: [#{config_file_options_with_defaults[:hadoop_version]}]")
136
+ configuration[:data_bucket] = ask("Input data S3 bucket: [#{config_file_options_with_defaults[:data_bucket]}]")
137
+ configuration[:job_bucket] = ask("Job S3 bucket (where JAR is uploaded, output logs and job output go to): [#{config_file_options_with_defaults[:job_bucket]}]")
138
+ configuration.each do |key, value|
139
+ value = configuration[key] = config_file_options_with_defaults[key] if value.empty?
140
+ configuration.delete(key) if value.empty? || value == DEFAULTS[key]
141
+ end
142
+ File.open('.humboldt.yml', 'w') { |f| YAML.dump(configuration, f) }
143
+ say('Updated .humboldt.yml', :green)
144
+ end
145
+
146
+ no_commands do
147
+ def options
148
+ @extended_options ||= Thor::CoreExt::HashWithIndifferentAccess.new(config_file_options_with_defaults.merge(super))
149
+ end
150
+ end
151
+
152
+ private
153
+
154
+ ISO_DATE_TIME = '%Y-%m-%d %H:%M:%S'.freeze
155
+
156
+ def project_jar
157
+ @project_jar ||= Dir['build/*.jar'].reject { |path| path.start_with?('build/jruby-complete') }.first
158
+ end
159
+
160
+ def job_package
161
+ @job_package ||= Rubydoop::Package.new(lib_jars: Dir[File.expand_path('../../**/*.jar', __FILE__)])
162
+ end
163
+
164
+ def job_config
165
+ options[:job_config] || job_package.project_name
166
+ end
167
+
168
+ def default_hadoop_config_path
169
+ File.expand_path('../../../config/hadoop-local.xml', __FILE__)
170
+ end
171
+
172
+ def s3
173
+ @s3 ||= AWS::S3.new
174
+ end
175
+
176
+ def emr
177
+ @emr ||= AWS::EMR.new(region: options[:aws_region])
178
+ end
179
+
180
+ def job_bucket
181
+ @job_bucket ||= s3.buckets[options[:job_bucket]]
182
+ end
183
+
184
+ def data_bucket
185
+ @data_bucket ||= s3.buckets[options[:data_bucket]]
186
+ end
187
+
188
+ def check_job!
189
+ raise Thor::Error, "No such job: #{job_config}" unless File.exists?("lib/#{job_config}.rb")
190
+ end
191
+
192
+ def relative_path(path)
193
+ path.sub(Dir.pwd + '/', '')
194
+ end
195
+
196
+ def check_local_output!(path)
197
+ if File.exists?(path)
198
+ raise Thor::Error, "#{options[:output]} already exists!"
199
+ end
200
+ end
201
+
202
+ def run_command(*args)
203
+ say_status(:running, 'Hadoop started')
204
+ Open3.popen3(*args) do |stdin, stdout, stderr, wait_thr|
205
+ stdin.close
206
+ stdout_printer = Thread.new(stdout) do |stdout|
207
+ while line = stdout.gets
208
+ say(line.chomp)
209
+ end
210
+ end
211
+ stderr_printer = Thread.new(stderr) do |stderr|
212
+ filter = HadoopStatusFilter.new(stderr, self, options.silent?)
213
+ filter.run
214
+ end
215
+ stdout_printer.join
216
+ stderr_printer.join
217
+ if wait_thr.value.exitstatus == 0
218
+ say_status(:done, 'Job completed')
219
+ else
220
+ say_status(:failed, 'Job failed', :red)
221
+ end
222
+ end
223
+ end
224
+
225
+ def print_job_flow_extended_status(job_flow)
226
+ id = job_flow.job_flow_id
227
+ state = job_flow.state
228
+ created_at = job_flow.created_at.strftime(ISO_DATE_TIME)
229
+ change_reason = job_flow.last_state_change_reason
230
+ say_status(:started, created_at)
231
+ say_status(:state, state)
232
+ say_status(:change, change_reason)
233
+ rescue => e
234
+ say_status(:error, e.message, :red)
235
+ sleep 1
236
+ retry
237
+ end
238
+
239
+ def print_job_flow_status(job_flow)
240
+ id = job_flow.job_flow_id
241
+ state = job_flow.state
242
+ created_at = job_flow.created_at.strftime(ISO_DATE_TIME)
243
+ change_reason = job_flow.last_state_change_reason
244
+ say_status(:status, sprintf('%-15s %-10s %19s %s', id, state, created_at, change_reason))
245
+ rescue => e
246
+ say_status(:error, e.message, :red)
247
+ sleep 1
248
+ retry
249
+ end
250
+
251
+ def config_file_options_with_defaults
252
+ @config_file_options_with_defaults ||= DEFAULTS.merge(options_from_config_file)
253
+ end
254
+
255
+ def options_from_config_file
256
+ @options_from_config_file ||= begin
257
+ ::YAML::load_file(".humboldt.yml")
258
+ rescue Errno::ENOENT
259
+ {}
260
+ end
261
+ end
262
+ end
263
+ end
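
Option resolution in the CLI is a three-layer merge: the built-in `DEFAULTS`, then whatever `.humboldt.yml` contains, then the command-line flags (see `#options` and `#config_file_options_with_defaults` above). A hedged sketch of that precedence outside Thor, with illustrative values:

    require 'yaml'

    defaults  = { data_path: 'data/completes', instance_count: 4, aws_region: 'eu-west-1' }  # subset of Humboldt::Cli::DEFAULTS
    from_file = begin
      YAML.load_file('.humboldt.yml')   # e.g. written by `humboldt configure`
    rescue Errno::ENOENT
      {}
    end
    cli_flags = { instance_count: 8 }   # whatever Thor parsed from the command line

    effective = defaults.merge(from_file).merge(cli_flags)
    # => DEFAULTS overridden by the config file, which is overridden by explicit flags
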
data/lib/humboldt/emr_flow.rb ADDED
@@ -0,0 +1,198 @@
1
+ # encoding: utf-8
2
+
3
+ module Humboldt
4
+ class EmrFlow
5
+ attr_reader :output_path
6
+
7
+ def initialize(*args)
8
+ @job_name, @input_glob, @package, @emr, @data_bucket, @job_bucket, @output_path = args
9
+ @output_path ||= "#{@package.project_name}/#{@job_name}/output"
10
+ end
11
+
12
+ def prepare!
13
+ upload_bootstrap_task_files!
14
+ upload_jar!
15
+ end
16
+
17
+ def cleanup!
18
+ delete_output_dir!
19
+ end
20
+
21
+ def run!(launch_options={})
22
+ check_jar!
23
+ check_output_dir!
24
+ create_flow!(launch_options)
25
+ end
26
+
27
+ def jar_path
28
+ "#{@package.project_name}/#{File.basename(@package.jar_path)}"
29
+ end
30
+
31
+ def jar_uri
32
+ s3_uri(jar_path)
33
+ end
34
+
35
+ def output_uri
36
+ s3_uri(output_path)
37
+ end
38
+
39
+ def log_path
40
+ "#{@package.project_name}/#{@job_name}/logs"
41
+ end
42
+
43
+ private
44
+
45
+ BOOTSTRAP_TASK_FILES = {
46
+ :remove_old_jruby => 'config/emr-bootstrap/remove_old_jruby.sh'
47
+ }.freeze
48
+
49
+ def s3_uri(path, options={})
50
+ protocol = options[:protocol] || 's3'
51
+ bucket = options[:bucket] || @job_bucket
52
+ "#{protocol}://#{bucket.name}/#{path}"
53
+ end
54
+
55
+ def upload_bootstrap_task_files!
56
+ BOOTSTRAP_TASK_FILES.values.each do |local_path|
57
+ remote_obj = @job_bucket.objects["#{@package.project_name}/#{local_path}"]
58
+ remote_obj.write(Pathname.new(File.expand_path(local_path, "#{__FILE__}/../../..")))
59
+ end
60
+ end
61
+
62
+ def upload_jar!
63
+ # TODO: upload only if not exists and MD5 != ETag
64
+ jar_obj = @job_bucket.objects[jar_path]
65
+ jar_obj.write(Pathname.new(@package.jar_path))
66
+ end
67
+
68
+ def check_jar!
69
+ unless @job_bucket.objects.with_prefix(jar_path).any?
70
+ raise "Job JAR missing (#{s3_uri(jar_path)}"
71
+ end
72
+ end
73
+
74
+ def check_output_dir!
75
+ if @job_bucket.objects.with_prefix(output_path).any?
76
+ raise "Output directory already exists (#{s3_uri(output_path)})"
77
+ end
78
+ end
79
+
80
+ def delete_output_dir!
81
+ @job_bucket.objects.with_prefix(output_path).delete_all
82
+ end
83
+
84
+ def job_flow_configuration(launch_options)
85
+ {
86
+ :log_uri => s3_uri(log_path),
87
+ :instances => instance_configuration(launch_options),
88
+ :steps => [step_configuration(launch_options)],
89
+ :bootstrap_actions => bootstrap_actions,
90
+ :visible_to_all_users => true
91
+ }
92
+ end
93
+
94
+ def instance_configuration(launch_options)
95
+ {
96
+ :ec2_key_name => launch_options[:ec2_key_name],
97
+ :hadoop_version => launch_options[:hadoop_version],
98
+ :instance_groups => InstanceGroupConfiguration.create(launch_options)
99
+ }
100
+ end
101
+
102
+ def bootstrap_actions
103
+ remove_old_jruby_action = {
104
+ :name => 'remove_old_jruby',
105
+ :script_bootstrap_action => {
106
+ :path => s3_uri("#{@package.project_name}/#{BOOTSTRAP_TASK_FILES[:remove_old_jruby]}")
107
+ }
108
+ }
109
+
110
+ # http://hadoop.apache.org/docs/r1.0.3/mapred-default.html
111
+ configure_hadoop_action = {
112
+ :name => 'configure_hadoop',
113
+ :script_bootstrap_action => {
114
+ :path => 's3://eu-west-1.elasticmapreduce/bootstrap-actions/configure-hadoop',
115
+ :args => [
116
+ '-m', 'mapred.job.reuse.jvm.num.tasks=-1',
117
+ '-m', 'mapred.map.tasks.speculative.execution=false',
118
+ '-m', 'mapred.reduce.tasks.speculative.execution=false'
119
+ ]
120
+ }
121
+ }
122
+
123
+ [remove_old_jruby_action, configure_hadoop_action]
124
+ end
125
+
126
+ def step_configuration(launch_options)
127
+ {
128
+ :name => @package.project_name,
129
+ :hadoop_jar_step => {
130
+ :jar => s3_uri(jar_path),
131
+ :args => [
132
+ @job_name,
133
+ s3_uri(@input_glob, protocol: 's3n', bucket: @data_bucket),
134
+ s3_uri(output_path, protocol: 's3n'),
135
+ *launch_options[:extra_hadoop_args]
136
+ ]
137
+ }
138
+ }
139
+ end
140
+
141
+ def create_flow!(launch_options)
142
+ job_flow = @emr.job_flows.create(@package.project_name, job_flow_configuration(launch_options))
143
+ end
144
+
145
+ module InstanceGroupConfiguration
146
+ extend self
147
+
148
+ # TODO: add 'task' group when support is added for 'tasks'
149
+ INSTANCE_GROUPS = %w[master core].freeze
150
+ MASTER_INSTANCE_TYPE = 'm1.small'.freeze
151
+ DEFAULT_CORE_INSTANCE_TYPE = 'c1.xlarge'.freeze
152
+ DEFAULT_BID_PRICE = '0.2'.freeze
153
+ DEFAULT_CORE_INSTANCE_COUNT = 4
154
+
155
+ INSTANCE_TYPE_MAPPINGS = {
156
+ 'master' => MASTER_INSTANCE_TYPE,
157
+ 'core' => DEFAULT_CORE_INSTANCE_TYPE
158
+ }.freeze
159
+
160
+ INSTANCE_COUNT_MAPPINGS = {
161
+ 'master' => 1,
162
+ 'core' => DEFAULT_CORE_INSTANCE_COUNT
163
+ }.freeze
164
+
165
+ def base_configuration(group)
166
+ {:name => "#{group.capitalize} Group", :instance_role => group.upcase}
167
+ end
168
+
169
+ def configure_type_and_count(group, configuration, options = {})
170
+ if group == 'core'
171
+ configuration[:instance_type] = options[:instance_type]
172
+ configuration[:instance_count] = options[:instance_count]
173
+ end
174
+
175
+ configuration[:instance_type] ||= INSTANCE_TYPE_MAPPINGS[group]
176
+ configuration[:instance_count] ||= INSTANCE_COUNT_MAPPINGS[group]
177
+ end
178
+
179
+ def configure_market(group, configuration, spot_instances, bid_price)
180
+ if spot_instances && (spot_instances.empty? || spot_instances.include?(group))
181
+ configuration[:market] = 'SPOT'
182
+ configuration[:bid_price] = bid_price || DEFAULT_BID_PRICE
183
+ else
184
+ configuration[:market] = 'ON_DEMAND'
185
+ end
186
+ end
187
+
188
+ def create(options)
189
+ INSTANCE_GROUPS.map do |group|
190
+ configuration = base_configuration(group)
191
+ configure_type_and_count(group, configuration, options)
192
+ configure_market(group, configuration, options[:spot_instances], options[:bid_price])
193
+ configuration
194
+ end
195
+ end
196
+ end
197
+ end
198
+ end
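
`Humboldt::Cli#run_emr` is the only caller of this class in the gem; a reduced, hedged sketch of the same call sequence, with the bucket names, input glob and launch options purely illustrative:

    require 'aws'
    require 'rubydoop/package'
    require 'humboldt/emr_flow'

    emr         = AWS::EMR.new(region: 'eu-west-1')
    s3          = AWS::S3.new
    data_bucket = s3.buckets['example-input-data']   # illustrative bucket names
    job_bucket  = s3.buckets['example-emr-jobs']
    package     = Rubydoop::Package.new

    flow = Humboldt::EmrFlow.new('word_count', 'events/2014-06-*', package, emr, data_bucket, job_bucket, nil)
    flow.prepare!                      # uploads the bootstrap script and the job JAR
    job_flow = flow.run!(
      instance_count: 4,
      instance_type: 'c1.xlarge',
      spot_instances: %w[core],
      bid_price: '0.2',
      hadoop_version: '1.0.3',
      extra_hadoop_args: []
    )
    job_flow.job_flow_id               # the ID the CLI writes to .humboldtjob
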
data/lib/humboldt/hadoop_status_filter.rb ADDED
@@ -0,0 +1,97 @@
1
+ # encoding: utf-8
2
+
3
+ module Humboldt
4
+ class HadoopStatusFilter
5
+ def initialize(hadoop_stderr, shell, silent)
6
+ @hadoop_stderr = hadoop_stderr
7
+ @shell = shell
8
+ @silent = silent
9
+ @counters = {}
10
+ end
11
+
12
+ def run
13
+ counter_group = nil
14
+ while line = @hadoop_stderr.gets
15
+ if @counters_printing && (hadoop_log?(line) || line =~ /^\t+/)
16
+ case line.chomp
17
+ when /(?:JobClient: |\t+)([^\t]+)=(\d+)$/
18
+ if counter_group
19
+ @counters[counter_group] ||= {}
20
+ @counters[counter_group][$1.strip] = $2.to_i
21
+ end
22
+ when /(?:JobClient: |\t+)([^\t]+)$/
23
+ counter_group = $1.strip
24
+ end
25
+ elsif @error_printing && !hadoop_log?(line) && !ignore?(line)
26
+ report_error(line)
27
+ elsif ignore?(line)
28
+ # do nothing
29
+ else
30
+ @counters_printing = false
31
+ @error_printing = false
32
+ case line
33
+ when /map (\d+)% reduce (\d+)%/
34
+ report_progress($1, $2)
35
+ when /Counters: \d+/
36
+ @counters_printing = true
37
+ else
38
+ unless hadoop_log?(line)
39
+ @error_printing = true
40
+ if line =~ /warning(!|:)/i
41
+ @error_type = :warning
42
+ else
43
+ @error_type = :error
44
+ end
45
+ report_error(line)
46
+ end
47
+ end
48
+ end
49
+ @shell.say(line.chomp, :red) unless @silent
50
+ end
51
+ print_counters_table
52
+ end
53
+
54
+ private
55
+
56
+ def hadoop_log?(line)
57
+ line =~ /(?:INFO|WARN) (?:mapred|input|output|util|jvm|mapreduce)\./
58
+ end
59
+
60
+ def ignore?(line)
61
+ case line
62
+ when /^\s*$/,
63
+ /Warning: \$HADOOP_HOME is deprecated/,
64
+ /Unable to load realm info from SCDynamicStore/,
65
+ /Unable to load native-hadoop library/,
66
+ /Snappy native library not loaded/,
67
+ /Configuration.deprecation:/,
68
+ /WARN conf.Configuration.*attempt to override final parameter.*ignoring/i
69
+ true
70
+ else
71
+ false
72
+ end
73
+ end
74
+
75
+ def report_progress(map, reduce)
76
+ @shell.say_status(:progress, "map #{map}%, reduce #{reduce}%")
77
+ end
78
+
79
+ def report_error(line)
80
+ @shell.say_status(@error_type, line.chomp, @error_type == :error ? :red : :yellow)
81
+ end
82
+
83
+ def print_counters_table
84
+ table = @counters.flat_map do |group, counters|
85
+ [
86
+ [group, *counters.first],
87
+ *counters.drop(1).map { |counter, value| ['', counter, value] },
88
+ ['', '', '']
89
+ ]
90
+ end
91
+ table.pop
92
+ @shell.say
93
+ @shell.print_table(table)
94
+ @shell.say
95
+ end
96
+ end
97
+ end
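
The filter consumes the `hadoop` command's stderr and reports through a Thor shell, which is how `Humboldt::Cli#run_command` uses it. A hedged sketch of the same hookup outside the CLI (the jar path and arguments are illustrative):

    require 'open3'
    require 'thor'
    require 'humboldt/hadoop_status_filter'

    shell = Thor::Shell::Color.new
    command = %w[hadoop jar build/word_count.jar word_count data/in/* data/out]   # illustrative

    Open3.popen3(*command) do |stdin, stdout, stderr, wait_thr|
      stdin.close
      out_thread = Thread.new do
        while (line = stdout.gets)
          shell.say(line.chomp)
        end
      end
      err_thread = Thread.new do
        Humboldt::HadoopStatusFilter.new(stderr, shell, true).run   # true silences the raw log lines
      end
      [out_thread, err_thread].each(&:join)
      wait_thr.value.exitstatus   # 0 when the job succeeded
    end
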
data/lib/humboldt/java_lib.rb ADDED
@@ -0,0 +1,5 @@
1
+ module Humboldt
2
+ module JavaLib
3
+ include_package 'humboldt'
4
+ end
5
+ end
data/lib/humboldt/mapper.rb ADDED
@@ -0,0 +1,15 @@
1
+ # encoding: utf-8
2
+
3
+ module Humboldt
4
+ class Mapper < Processor
5
+ class << self
6
+ def map(&block)
7
+ define_method(:map) do |key, value, context|
8
+ @in_key.hadoop = key
9
+ @in_value.hadoop = value
10
+ instance_exec(@in_key.ruby, @in_value.ruby, &block)
11
+ end
12
+ end
13
+ end
14
+ end
15
+ end
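
A hedged example of the mapper DSL this class enables. `WordCount::Mapper` and its fields are invented for illustration; `input`, `output`, `setup` and `emit` come from `Humboldt::Processor`, and `map` is defined above.

    require 'humboldt'

    module WordCount
      class Mapper < Humboldt::Mapper
        input :long, :text    # e.g. byte offset and line from TextInputFormat
        output :text, :long

        setup do
          @word_pattern = /\w+/
        end

        map do |_offset, line|
          line.downcase.scan(@word_pattern) { |word| emit(word, 1) }
        end
      end
    end
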
data/lib/humboldt/patterns/sum_reducer.rb ADDED
@@ -0,0 +1,16 @@
1
+ # encoding: utf-8
2
+
3
+ module Humboldt
4
+ module Patterns
5
+ class SumReducer < Reducer
6
+ input :text, :long
7
+ output :text, :long
8
+
9
+ reduce do |key, values|
10
+ sum = 0
11
+ values.each { |v| sum += v }
12
+ emit(key, sum)
13
+ end
14
+ end
15
+ end
16
+ end
data/lib/humboldt/prefix_grouping.rb ADDED
@@ -0,0 +1,46 @@
1
+ # encoding: utf-8
2
+
3
+ require 'zlib'
4
+
5
+
6
+ module Humboldt
7
+ class BinaryPrefixPartitioner
8
+ def initialize(cutoff_index)
9
+ @cutoff_index = cutoff_index
10
+ end
11
+
12
+ def partition(key, value, num_partitions)
13
+ length = @cutoff_index > key.length ? key.length : @cutoff_index
14
+ prefix = String.from_java_bytes(key.bytes)[0, length]
15
+ Zlib.crc32(prefix) % num_partitions
16
+ end
17
+ end
18
+
19
+ class DropBinaryPrefixPartitioner < BinaryPrefixPartitioner
20
+ def partition(key, value, num_partitions)
21
+ length = key.length > @cutoff_index ? key.length - @cutoff_index : 0
22
+ prefix = String.from_java_bytes(key.bytes)[0, length]
23
+ Zlib.crc32(prefix) % num_partitions
24
+ end
25
+ end
26
+
27
+ class BinaryPrefixComparator
28
+ def initialize(cutoff_index)
29
+ @cutoff_index = cutoff_index
30
+ end
31
+
32
+ def compare_raw(bytes1, start1, length1, bytes2, start2, length2)
33
+ subset_length1 = @cutoff_index > length1 ? length1 : @cutoff_index
34
+ subset_length2 = @cutoff_index > length2 ? length2 : @cutoff_index
35
+ ::Hadoop::Io::WritableComparator.compareBytes(bytes1, start1, subset_length1, bytes2, start2, subset_length2)
36
+ end
37
+ end
38
+
39
+ class DropBinaryPrefixComparator < BinaryPrefixComparator
40
+ def compare_raw(bytes1, start1, length1, bytes2, start2, length2)
41
+ subset_length1 = length1 - @cutoff_index
42
+ subset_length2 = length2 - @cutoff_index
43
+ ::Hadoop::Io::WritableComparator.compareBytes(bytes1, start1, subset_length1, bytes2, start2, subset_length2)
44
+ end
45
+ end
46
+ end
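
A hedged sketch of what the prefix partitioner buys you: keys that share the leading byte range land in the same partition, so one reducer sees all records for that prefix. Running it needs JRuby with the Hadoop classes loaded; the key contents are illustrative.

    require 'humboldt'

    partitioner = Humboldt::BinaryPrefixPartitioner.new(8)   # partition on the first 8 bytes

    key_a = Hadoop::Io::BytesWritable.new('session1:page-view'.to_java_bytes)
    key_b = Hadoop::Io::BytesWritable.new('session1:click'.to_java_bytes)

    partitioner.partition(key_a, nil, 16) == partitioner.partition(key_b, nil, 16)   # => true, same prefix
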
data/lib/humboldt/processor.rb ADDED
@@ -0,0 +1,96 @@
1
+ # encoding: utf-8
2
+
3
+ module Humboldt
4
+ class Processor
5
+ class << self
6
+ def self.type_accessor(*names)
7
+ names.each do |name|
8
+ module_eval <<-EOA
9
+ def #{name}
10
+ @#{name} || superclass.#{name}
11
+ end
12
+ def #{name}=(type)
13
+ @#{name} = TypeConverter[type]
14
+ define_method(:#{name}_accessor) do
15
+ TypeConverter[type].new
16
+ end
17
+ end
18
+ EOA
19
+ end
20
+ end
21
+
22
+ type_accessor :input_key, :input_value, :output_key, :output_value
23
+
24
+ def input(*types)
25
+ self.input_key = types.first
26
+ self.input_value = types.last
27
+ end
28
+
29
+ def output(*types)
30
+ self.output_key = types.first
31
+ self.output_value = types.last
32
+ end
33
+
34
+ def setup(&block)
35
+ define_method(:instance_setup, &block)
36
+ private(:instance_setup)
37
+ end
38
+
39
+ def cleanup(&block)
40
+ define_method(:instance_cleanup, &block)
41
+ private(:instance_cleanup)
42
+ end
43
+ end
44
+
45
+ attr_reader :current_context
46
+
47
+ def setup(context)
48
+ @current_context = context
49
+ @in_key = input_key_accessor
50
+ @in_value = input_value_accessor
51
+ @out_key = output_key_accessor
52
+ @out_value = output_value_accessor
53
+ unless Hadoop::Mapreduce::Job.instance_methods.include?(:add_cache_file)
54
+ create_symlinks!
55
+ end
56
+ instance_setup
57
+ end
58
+
59
+ def cleanup(context)
60
+ instance_cleanup
61
+ end
62
+
63
+ protected
64
+
65
+ def emit(key, value)
66
+ @out_key.ruby = key
67
+ @out_value.ruby = value
68
+ @current_context.write(@out_key.hadoop, @out_value.hadoop)
69
+ end
70
+
71
+ private
72
+
73
+ def instance_setup
74
+ end
75
+
76
+ def instance_cleanup
77
+ end
78
+
79
+ def create_symlinks!
80
+ distributed_cache = ::Hadoop::FileCache::DistributedCache
81
+ files = distributed_cache.get_cache_files(@current_context.configuration)
82
+ local_files = distributed_cache.get_local_cache_files(@current_context.configuration)
83
+ if files && local_files
84
+ work_dir = ENV['HADOOP_WORK_DIR']
85
+ files.each_with_index do |file, i|
86
+ target = local_files[i].to_s
87
+ link_path = File.join(work_dir, file.fragment)
88
+ FileUtils.mkdir_p(File.dirname(link_path))
89
+ unless File.exists?(link_path)
90
+ FileUtils.ln_s(target, link_path)
91
+ end
92
+ end
93
+ end
94
+ end
95
+ end
96
+ end
data/lib/humboldt/reducer.rb ADDED
@@ -0,0 +1,34 @@
1
+ # encoding: utf-8
2
+
3
+ module Humboldt
4
+ class Reducer < Processor
5
+ class << self
6
+ def reduce(&block)
7
+ define_method(:reduce) do |key, values, context|
8
+ @in_key.hadoop = key
9
+ values_enumerator = TypeConversionEnumerator.new(@in_value, values.iterator)
10
+ instance_exec(@in_key.ruby, values_enumerator, &block)
11
+ end
12
+ end
13
+ end
14
+
15
+ class TypeConversionEnumerator < Enumerator
16
+ def initialize(*args)
17
+ @value_converter, @hadoop_iterator = args
18
+ end
19
+
20
+ def each
21
+ while @hadoop_iterator.has_next
22
+ @value_converter.hadoop = @hadoop_iterator.next
23
+ yield @value_converter.ruby
24
+ end
25
+ end
26
+
27
+ def next
28
+ raise StopIteration unless @hadoop_iterator.has_next
29
+ @value_converter.hadoop = @hadoop_iterator.next
30
+ @value_converter.ruby
31
+ end
32
+ end
33
+ end
34
+ end
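
A hedged reducer example to pair with the mapper sketch earlier; note that the values argument is a `TypeConversionEnumerator` wrapping Hadoop's iterator, so it can only be traversed once.

    require 'humboldt'

    module WordCount
      class Reducer < Humboldt::Reducer
        input :text, :long
        output :text, :long

        reduce do |word, counts|
          emit(word, counts.reduce(0) { |sum, n| sum + n })   # single pass over the Hadoop iterator
        end
      end
    end
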
data/lib/humboldt/rspec.rb ADDED
@@ -0,0 +1,100 @@
1
+ # encoding: utf-8
2
+
3
+ require 'humboldt'
4
+
5
+
6
+ module RunnerHelpers
7
+ def run_nokey_mapper(mapper, *values, &context_callback)
8
+ key = mapper.input_key_accessor.ruby
9
+ args = values.map { |value| [key, value] }
10
+ run_mapper(mapper, *args, &context_callback)
11
+ end
12
+
13
+ def run_mapper(mapper, *entries, &context_callback)
14
+ in_value = mapper.input_value_accessor
15
+ run(mapper, :map, context_callback, *entries) do |value|
16
+ in_value.ruby = value
17
+ in_value.hadoop
18
+ end
19
+ end
20
+
21
+ def run_reducer(reducer, *entries, &context_callback)
22
+ run(reducer, :reduce, context_callback, *entries) do |value|
23
+ fake_iterator(*value.map do |v|
24
+ in_value = reducer.input_value_accessor
25
+ in_value.ruby = v
26
+ in_value.hadoop
27
+ end)
28
+ end
29
+ end
30
+
31
+ def run(runner, method, context_callback, *entries)
32
+ in_key = runner.input_key_accessor
33
+ context = FakeContext.new(runner.output_key_accessor, runner.output_value_accessor)
34
+ context_callback.call(context) if context_callback
35
+ runner.setup(context)
36
+ entries.each do |entry|
37
+ in_key.ruby = entry.first
38
+ runner.send(method, in_key.hadoop, yield(entry.last), context)
39
+ end
40
+ runner.cleanup(context)
41
+ context.results
42
+ end
43
+
44
+ def fake_iterator(*values)
45
+ FakeIterable.new(values)
46
+ end
47
+
48
+ class FakeIterable
49
+ def initialize(values)
50
+ @values = values
51
+ end
52
+ def iterator
53
+ FakeIterator.new(@values.dup)
54
+ end
55
+ end
56
+
57
+ class FakeIterator
58
+ def initialize(values)
59
+ @values = values
60
+ end
61
+ def has_next
62
+ !@values.empty?
63
+ end
64
+ def next
65
+ @values.shift
66
+ end
67
+ end
68
+
69
+ class FakeContext
70
+ attr_reader :results, :counters
71
+
72
+ def initialize(key_accessor, value_accessor)
73
+ @key_accessor, @value_accessor = key_accessor, value_accessor
74
+ @results = []
75
+ @counters = Hash.new { |h,k| h[k] = Hash.new { |h2,k2| h2[k2] = 0 } }
76
+ end
77
+
78
+ def write(key, value)
79
+ @key_accessor.hadoop = key
80
+ @value_accessor.hadoop = value
81
+ @results << [@key_accessor.ruby, @value_accessor.ruby]
82
+ end
83
+
84
+ def configuration
85
+ @configuration ||= ::Hadoop::Conf::Configuration.new.tap do |config|
86
+ config.set 'mapred.job.tracker', 'local'
87
+ end
88
+ end
89
+
90
+ def get_counter(group, name)
91
+ FakeCounter.new do |amount|
92
+ @counters[group][name] += amount
93
+ end
94
+ end
95
+ end
96
+ end
97
+
98
+ RSpec.configure do |conf|
99
+ conf.include(RunnerHelpers)
100
+ end
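
With these helpers included via the `RSpec.configure` block above (loaded by requiring `humboldt/rspec`), a spec for the hypothetical `WordCount` classes from the earlier sketches might look like this:

    require 'humboldt/rspec'
    require 'word_count_classes'   # hypothetical file defining WordCount::Mapper and WordCount::Reducer

    describe WordCount::Mapper do
      it 'emits a count of one per word' do
        results = run_mapper(described_class.new, [0, 'the quick brown fox'], [20, 'the lazy dog'])
        expect(results).to include(['the', 1], ['fox', 1], ['dog', 1])
      end
    end

    describe WordCount::Reducer do
      it 'sums the counts for each word' do
        results = run_reducer(described_class.new, ['the', [1, 1, 1]])
        expect(results).to eq([['the', 3]])
      end
    end
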
data/lib/humboldt/type_converters.rb ADDED
@@ -0,0 +1,180 @@
1
+ # encoding: utf-8
2
+
3
+ module Humboldt
4
+ module TypeConverter
5
+ class Binary
6
+ HADOOP = ::Hadoop::Io::BytesWritable
7
+ RUBY = ::String
8
+
9
+ attr_reader :hadoop
10
+
11
+ def hadoop=(value)
12
+ unless value.is_a?(HADOOP)
13
+ raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected #{HADOOP}"
14
+ end
15
+ @hadoop = value
16
+ end
17
+
18
+ def initialize
19
+ @hadoop = HADOOP.new
20
+ end
21
+
22
+ def ruby
23
+ String.from_java_bytes(@hadoop.bytes).byteslice(0, @hadoop.length)
24
+ end
25
+
26
+ def ruby=(value)
27
+ unless value.is_a?(RUBY)
28
+ raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected #{RUBY}"
29
+ end
30
+
31
+ @hadoop.set(value.to_java_bytes, 0, value.bytesize)
32
+ end
33
+ end
34
+
35
+ begin
36
+ require 'msgpack'
37
+
38
+ class Encoded < Binary
39
+ def ruby=(value)
40
+ unless value.is_a?(Hash) || value.is_a?(Array)
41
+ raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected Hash or Array"
42
+ end
43
+ packed = MessagePack.pack(value)
44
+ @hadoop.set(packed.to_java_bytes, 0, packed.bytesize)
45
+ end
46
+
47
+ def ruby
48
+ packed = String.from_java_bytes(@hadoop.bytes).byteslice(0, @hadoop.length)
49
+ MessagePack.unpack(packed, encoding: Encoding::UTF_8)
50
+ end
51
+ end
52
+ rescue LoadError
53
+ end
54
+
55
+ class Text
56
+ HADOOP = ::Hadoop::Io::Text
57
+ RUBY = ::String
58
+
59
+ attr_reader :hadoop
60
+
61
+ def hadoop=(value)
62
+ unless value.is_a?(HADOOP)
63
+ raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected #{HADOOP}"
64
+ end
65
+ @hadoop = value
66
+ end
67
+
68
+ def initialize
69
+ @hadoop = HADOOP.new
70
+ end
71
+
72
+ def ruby
73
+ String.from_java_bytes(@hadoop.bytes).byteslice(0, @hadoop.length).force_encoding(Encoding::UTF_8)
74
+ end
75
+
76
+ def ruby=(value)
77
+ unless value.is_a?(RUBY)
78
+ raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected #{RUBY}"
79
+ end
80
+
81
+ if value.encoding == Encoding::UTF_8
82
+ @hadoop.set(value.to_java_bytes, 0, value.bytesize)
83
+ else
84
+ @hadoop.set(value)
85
+ end
86
+ end
87
+ end
88
+
89
+ begin
90
+ require 'json'
91
+
92
+ class Json < Text
93
+ def ruby=(value)
94
+ unless value.is_a?(Hash) || value.is_a?(Array)
95
+ raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected Hash or Array"
96
+ end
97
+ @hadoop.set(JSON.generate(value))
98
+ end
99
+
100
+ def ruby
101
+ JSON.parse(hadoop.to_s)
102
+ end
103
+ end
104
+ end
105
+
106
+ class Long
107
+ HADOOP = ::Hadoop::Io::LongWritable
108
+ RUBY = ::Integer
109
+
110
+ attr_reader :hadoop
111
+
112
+ def hadoop=(value)
113
+ unless value.is_a?(HADOOP)
114
+ raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected #{HADOOP}"
115
+ end
116
+ @hadoop = value
117
+ end
118
+
119
+ def initialize
120
+ @hadoop = HADOOP.new
121
+ end
122
+
123
+ def ruby
124
+ @hadoop.get
125
+ end
126
+
127
+ def ruby=(value)
128
+ unless value.is_a?(Integer)
129
+ raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected #{RUBY}"
130
+ end
131
+
132
+ @hadoop.set value
133
+ end
134
+ end
135
+
136
+ class None
137
+ HADOOP = ::Hadoop::Io::NullWritable
138
+ RUBY = ::NilClass
139
+
140
+ def hadoop
141
+ HADOOP.get
142
+ end
143
+
144
+ def hadoop=(value)
145
+ unless value.is_a?(HADOOP)
146
+ raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected #{HADOOP}"
147
+ end
148
+ end
149
+
150
+ def ruby
151
+ nil
152
+ end
153
+
154
+ def ruby=(value)
155
+ unless value.nil?
156
+ raise ArgumentError, "Hadoop type mismatch, was #{value.class}, expected #{RUBY}"
157
+ end
158
+ end
159
+ end
160
+
161
+ TYPE_CONVERTER_CLASS_CACHE = Hash.new { |h,k| h[k] = const_get(k.to_s.capitalize) }
162
+
163
+ def self.[](name)
164
+ TYPE_CONVERTER_CLASS_CACHE[name]
165
+ end
166
+
167
+ FROM_HADOOP_MAPPINGS = {
168
+ ::Hadoop::Io::Text => Text,
169
+ ::Hadoop::Io::BytesWritable => Binary,
170
+ ::Hadoop::Io::LongWritable => Long,
171
+ ::Hadoop::Io::NullWritable => None
172
+ }.freeze
173
+
174
+ def self.from_hadoop(hadoop_class)
175
+ accessor = FROM_HADOOP_MAPPINGS[hadoop_class]
176
+ raise ArgumentError, "Unsupported Hadoop type: #{hadoop_class}" unless accessor
177
+ accessor
178
+ end
179
+ end
180
+ end
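
A hedged sketch of the converter round trip that `Processor#emit` and the mapper/reducer wrappers rely on. `TypeConverter[:text]` resolves through the class cache above; running it needs JRuby with the Hadoop classes loaded.

    require 'humboldt'

    converter = Humboldt::TypeConverter[:text].new    # => instance of Humboldt::TypeConverter::Text
    converter.ruby = 'hello world'                    # stored as UTF-8 bytes in a Hadoop::Io::Text
    converter.hadoop                                  # => the underlying Hadoop::Io::Text
    converter.ruby                                    # => "hello world", forced back to UTF-8

    Humboldt::TypeConverter.from_hadoop(::Hadoop::Io::LongWritable)   # => Humboldt::TypeConverter::Long
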
data/lib/humboldt/version.rb ADDED
@@ -0,0 +1,5 @@
1
+ # encoding: utf-8
2
+
3
+ module Humboldt
4
+ VERSION = '1.0.0'.freeze
5
+ end
data/lib/humboldt.jar ADDED
Binary file
data/lib/humboldt.rb ADDED
@@ -0,0 +1,16 @@
1
+ # encoding: utf-8
2
+
3
+ require 'fileutils'
4
+ require 'rubydoop'
5
+ require 'hadoop'
6
+
7
+ require 'humboldt/java_lib'
8
+
9
+ require 'ext/hadoop'
10
+ require 'ext/rubydoop'
11
+
12
+ require 'humboldt/type_converters'
13
+ require 'humboldt/processor'
14
+ require 'humboldt/mapper'
15
+ require 'humboldt/reducer'
16
+ require 'humboldt/prefix_grouping'
metadata ADDED
@@ -0,0 +1,112 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: humboldt
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: java
6
+ authors:
7
+ - The Burt Platform Team
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-06-03 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - '>='
17
+ - !ruby/object:Gem::Version
18
+ version: '0'
19
+ name: thor
20
+ prerelease: false
21
+ type: :runtime
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ~>
31
+ - !ruby/object:Gem::Version
32
+ version: 1.1.2
33
+ name: rubydoop
34
+ prerelease: false
35
+ type: :runtime
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: 1.1.2
41
+ - !ruby/object:Gem::Dependency
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - '>='
45
+ - !ruby/object:Gem::Version
46
+ version: 1.16.0
47
+ - - <
48
+ - !ruby/object:Gem::Version
49
+ version: 1.33.0
50
+ name: aws-sdk
51
+ prerelease: false
52
+ type: :runtime
53
+ version_requirements: !ruby/object:Gem::Requirement
54
+ requirements:
55
+ - - '>='
56
+ - !ruby/object:Gem::Version
57
+ version: 1.16.0
58
+ - - <
59
+ - !ruby/object:Gem::Version
60
+ version: 1.33.0
61
+ description: Humboldt provides a mapreduce API abstraction built on top of Rubydoop, and tools to run Hadoop jobs effortlessly both locally and on Amazon EMR
62
+ email:
63
+ - theo@burtcorp.com
64
+ executables:
65
+ - humboldt
66
+ extensions: []
67
+ extra_rdoc_files: []
68
+ files:
69
+ - bin/humboldt
70
+ - config/emr-bootstrap/remove_old_jruby.sh
71
+ - config/hadoop-local.xml
72
+ - lib/ext/hadoop.rb
73
+ - lib/ext/rubydoop.rb
74
+ - lib/humboldt.jar
75
+ - lib/humboldt.rb
76
+ - lib/humboldt/cli.rb
77
+ - lib/humboldt/emr_flow.rb
78
+ - lib/humboldt/hadoop_status_filter.rb
79
+ - lib/humboldt/java_lib.rb
80
+ - lib/humboldt/mapper.rb
81
+ - lib/humboldt/patterns/sum_reducer.rb
82
+ - lib/humboldt/prefix_grouping.rb
83
+ - lib/humboldt/processor.rb
84
+ - lib/humboldt/reducer.rb
85
+ - lib/humboldt/rspec.rb
86
+ - lib/humboldt/type_converters.rb
87
+ - lib/humboldt/version.rb
88
+ homepage: http://github.com/burtcorp/humboldt
89
+ licenses:
90
+ - BSD-3-Clause
91
+ metadata: {}
92
+ post_install_message:
93
+ rdoc_options: []
94
+ require_paths:
95
+ - lib
96
+ required_ruby_version: !ruby/object:Gem::Requirement
97
+ requirements:
98
+ - - '>='
99
+ - !ruby/object:Gem::Version
100
+ version: '0'
101
+ required_rubygems_version: !ruby/object:Gem::Requirement
102
+ requirements:
103
+ - - '>='
104
+ - !ruby/object:Gem::Version
105
+ version: '0'
106
+ requirements: []
107
+ rubyforge_project:
108
+ rubygems_version: 2.2.2
109
+ signing_key:
110
+ specification_version: 4
111
+ summary: Tools and libraries for simplifying running Rubydoop jobs locally and on AWS Elastic MapReduce
112
+ test_files: []