wukong-hadoop 0.0.1

Files changed (59)
  1. data/.gitignore +59 -0
  2. data/.rspec +2 -0
  3. data/Gemfile +3 -0
  4. data/README.md +339 -0
  5. data/Rakefile +13 -0
  6. data/bin/hdp-bin +44 -0
  7. data/bin/hdp-bzip +23 -0
  8. data/bin/hdp-cat +3 -0
  9. data/bin/hdp-catd +3 -0
  10. data/bin/hdp-cp +3 -0
  11. data/bin/hdp-du +86 -0
  12. data/bin/hdp-get +3 -0
  13. data/bin/hdp-kill +3 -0
  14. data/bin/hdp-kill-task +3 -0
  15. data/bin/hdp-ls +11 -0
  16. data/bin/hdp-mkdir +2 -0
  17. data/bin/hdp-mkdirp +12 -0
  18. data/bin/hdp-mv +3 -0
  19. data/bin/hdp-parts_to_keys.rb +77 -0
  20. data/bin/hdp-ps +3 -0
  21. data/bin/hdp-put +3 -0
  22. data/bin/hdp-rm +32 -0
  23. data/bin/hdp-sort +40 -0
  24. data/bin/hdp-stream +40 -0
  25. data/bin/hdp-stream-flat +22 -0
  26. data/bin/hdp-stream2 +39 -0
  27. data/bin/hdp-sync +17 -0
  28. data/bin/hdp-wc +67 -0
  29. data/bin/wu-hadoop +14 -0
  30. data/examples/counter.rb +17 -0
  31. data/examples/map_only.rb +28 -0
  32. data/examples/processors.rb +4 -0
  33. data/examples/sonnet_18.txt +14 -0
  34. data/examples/tokenizer.rb +28 -0
  35. data/examples/word_count.rb +44 -0
  36. data/features/step_definitions/wu_hadoop_steps.rb +4 -0
  37. data/features/support/env.rb +1 -0
  38. data/features/wu_hadoop.feature +113 -0
  39. data/lib/wukong-hadoop.rb +21 -0
  40. data/lib/wukong-hadoop/configuration.rb +133 -0
  41. data/lib/wukong-hadoop/driver.rb +190 -0
  42. data/lib/wukong-hadoop/driver/hadoop_invocation.rb +184 -0
  43. data/lib/wukong-hadoop/driver/inputs_and_outputs.rb +27 -0
  44. data/lib/wukong-hadoop/driver/local_invocation.rb +48 -0
  45. data/lib/wukong-hadoop/driver/map_logic.rb +104 -0
  46. data/lib/wukong-hadoop/driver/reduce_logic.rb +129 -0
  47. data/lib/wukong-hadoop/extensions.rb +2 -0
  48. data/lib/wukong-hadoop/hadoop_env_methods.rb +80 -0
  49. data/lib/wukong-hadoop/version.rb +6 -0
  50. data/spec/spec_helper.rb +21 -0
  51. data/spec/support/driver_helper.rb +15 -0
  52. data/spec/support/integration_helper.rb +39 -0
  53. data/spec/wukong-hadoop/driver_spec.rb +117 -0
  54. data/spec/wukong-hadoop/hadoop_env_methods_spec.rb +14 -0
  55. data/spec/wukong-hadoop/hadoop_mode_spec.rb +78 -0
  56. data/spec/wukong-hadoop/local_mode_spec.rb +22 -0
  57. data/spec/wukong-hadoop/wu_hadoop_spec.rb +34 -0
  58. data/wukong-hadoop.gemspec +33 -0
  59. metadata +168 -0
data/lib/wukong-hadoop/driver.rb
@@ -0,0 +1,190 @@
+ require 'shellwords'
+ require_relative("driver/inputs_and_outputs")
+ require_relative("driver/map_logic")
+ require_relative("driver/reduce_logic")
+ require_relative("driver/local_invocation")
+ require_relative("driver/hadoop_invocation")
+
+ module Wukong
+   module Hadoop
+
+     # The <tt>Hadoop::Driver</tt> class contains the logic to examine
+     # arguments and construct command lines which it will execute to
+     # create the desired behavior.
+     #
+     # The Hadoop::Driver will introspect on its arguments to guess (if
+     # not given) the processors to use as mapper and reducer in a
+     # map/reduce job.  It will also decide whether to run that job in
+     # local or Hadoop mode.  These decisions result in a command which
+     # it will ultimately execute.
+     class Driver < Wukong::Driver
+
+       include InputsAndOutputs
+       include MapLogic
+       include ReduceLogic
+       include HadoopInvocation
+       include LocalInvocation
+
+       # The settings used by this driver.
+       #
+       # @return [Configliere::Param]
+       attr_accessor :settings
+
+       # The (processed) arguments for this driver.
+       #
+       # @return [Array<String, Pathname>]
+       attr_reader :args
+
+       # Initialize and run a new Wukong::Hadoop::Driver for the given
+       # +settings+.
+       #
+       # Will rescue all Wukong::Error exceptions by printing a nice
+       # message to STDERR and exiting.
+       #
+       # @param [Configliere::Param] settings
+       # @param [Array<String>] extra_args
+       def self.run(settings, *extra_args)
+         begin
+           new(settings, *extra_args).run!
+         rescue Wukong::Error => e
+           $stderr.puts e.message
+           exit(127)
+         end
+       end
+
+       # Run this driver.
+       def run!
+         if mode == :local
+           # Log.info "Launching local!"
+           execute_command!(local_commandline)
+         else
+           ensure_input_and_output!
+           remove_output_path! if settings[:rm] || settings[:overwrite]
+           Log.info "Launching Hadoop!"
+           execute_command!(hadoop_commandline)
+         end
+       end
+
+       # Initialize a new driver with the given +settings+ and +args+.
+       #
+       # @param [Configliere::Param] settings
+       # @param [Array<String>] args
+       def initialize(settings, *args)
+         @settings = settings
+         self.args = args
+       end
+
+       # Set the +args+ for this driver.
+       #
+       # Arguments can be either (registered) processor names or files.
+       #
+       # An error will be raised on missing files or those which
+       # couldn't be loaded.
+       #
+       # An error will be raised if more than two arguments (mapper and
+       # reducer) are passed.
+       #
+       # @param [Array<String>] args
+       def args= args
+         raise Error.new("Cannot provide more than two arguments") if args.length > 2
+         @args = args.map do |arg|
+           if processor_registered?(arg)
+             arg
+           else
+             begin
+               rp = Pathname.new(arg).realpath
+               load rp
+               rp
+             rescue => e
+               raise Error.new("No such processor or file: #{arg}")
+             end
+           end
+         end
+       end
+
+       # What mode is this driver in?
+       #
+       # @return [:hadoop, :local]
+       def mode
+         settings[:mode].to_s == 'local' ? :local : :hadoop
+       end
+
+       # Were mapper and/or reducer named by a single argument?
+       #
+       # @return [true, false]
+       def single_job_arg?
+         args.size == 1
+       end
+
+       # Were mapper and/or reducer named by separate arguments?
+       #
+       # @return [true, false]
+       def separate_map_and_reduce_args?
+         args.size == 2
+       end
+
+       # Is there a processor registered with the given +name+?
+       #
+       # @param [#to_s] name
+       # @return [true, false]
+       def processor_registered? name
+         Wukong.registry.registered?(name.to_s.to_sym)
+       end
+
+       # Return the guessed name of a processor at the given +path+.
+       #
+       # @param [String] path
+       # @return [String]
+       def processor_name_from_file(path)
+         File.basename(path, '.rb')
+       end
+
+       # Does the given +path+ contain a processor named after itself?
+       #
+       # @param [String] path
+       # @return [true, false]
+       def file_is_processor?(path)
+         processor_registered?(processor_name_from_file(path))
+       end
+
+       # The prefix to insert before all invocations of the
+       # <tt>wu-local</tt> runner.
+       #
+       # @return [String]
+       def command_prefix
+         settings[:command_prefix]
+       end
+
+       # Returns parameters to pass to an invocation of
+       # <tt>wu-local</tt>.
+       #
+       # Parameters like <tt>--reduce_tasks</tt> which are relevant to
+       # Wukong-Hadoop will be interpreted and *not* passed.  Others
+       # will be passed unmodified.
+       #
+       # @return [String]
+       def params_to_pass
+         s = (Wukong.loaded_deploy_pack? ? Deploy.pre_deploy_settings : settings)
+         s.reject{ |param, val| s.definition_of(param, :wukong_hadoop) }.map{ |param, val| "--#{param}=#{Shellwords.escape(val.to_s)}" }.join(" ")
+       end
+
+       # Execute a command composed of the given parts.
+       #
+       # Will print the command instead of executing it if the
+       # <tt>--dry_run</tt> option was given.
+       #
+       # @param [Array<String>] args
+       def execute_command!(*args)
+         command = args.flatten.reject(&:blank?).join(" \\\n  ")
+         if settings[:dry_run]
+           Log.info("Dry run:")
+           puts command
+         else
+           puts `#{command}`
+           raise "Streaming command failed!" unless $?.success?
+         end
+       end
+
+     end
+   end
+ end
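
For orientation, here is a minimal sketch of how this Driver is typically kicked off from a command-line wrapper such as bin/wu-hadoop. The Configliere boot shown is an assumption inferred from the settings object the class expects, not the actual contents of the 14-line binstub:

  #!/usr/bin/env ruby
  # Hypothetical launcher; the real boot lives in bin/wu-hadoop and the
  # plugin's configuration.rb.
  require 'wukong-hadoop'

  settings = Configliere::Param.new
  settings.use(:commandline)   # pick up --input, --output, --mode, etc. from ARGV
  settings.resolve!            # leftover positional args land in settings.rest

  # Positional args: at most two processor names and/or .rb files (mapper, reducer).
  Wukong::Hadoop::Driver.run(settings, *settings.rest)

Note that Driver.run traps Wukong::Error, prints the message, and exits with status 127, so a bad processor name fails cleanly instead of dumping a stack trace.
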
data/lib/wukong-hadoop/driver/hadoop_invocation.rb
@@ -0,0 +1,184 @@
+ module Wukong
+   module Hadoop
+
+     # Provides methods for executing a map/reduce job on a Hadoop
+     # cluster via {Hadoop
+     # streaming}[http://hadoop.apache.org/docs/r0.15.2/streaming.html].
+     module HadoopInvocation
+
+       # Raise an error unless we have input and output.
+       def ensure_input_and_output!
+         raise Error.new("Explicit --input and --output paths are required to run a job in Hadoop mode.") if input_paths.nil? || input_paths.empty? || output_path.nil? || output_path.empty?
+       end
+
+       # Remove the output path.
+       #
+       # Will not actually do anything if the <tt>--dry_run</tt> option
+       # is also given.
+       def remove_output_path!
+         cmd = %Q{#{settings[:hadoop_runner]} fs -rmr '#{output_path}'}
+         Log.info "Removing output file #{output_path}: #{cmd}"
+         puts `#{cmd}` unless settings[:dry_run]
+       end
+
+       # Return the Hadoop command used to launch this job in a Hadoop
+       # cluster.
+       #
+       # You should be able to copy, paste, and run this command
+       # unmodified when debugging.
+       #
+       # @return [String]
+       def hadoop_commandline
+         [
+           hadoop_runner,
+           "jar #{settings[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
+           hadoop_jobconf_options,
+           "-D mapred.job.name='#{job_name}'",
+           hadoop_other_args,
+           "-mapper '#{mapper_commandline}'",
+           "-reducer '#{reducer_commandline}'",
+           "-input '#{input_paths}'",
+           "-output '#{output_path}'",
+           hadoop_files,
+           io_formats,
+           hadoop_recycle_env,
+         ].flatten.compact.join(" \t\\\n  ")
+       end
+
+       # The job name that will be passed to Hadoop.
+       #
+       # Respects the <tt>--job_name</tt> option if given, otherwise
+       # constructs one from the given processors, input, and output
+       # paths.
+       #
+       # @return [String]
+       def job_name
+         return settings[:job_name] if settings[:job_name]
+         relevant_filename = args.compact.uniq.map { |path| File.basename(path, '.rb') }.join('-')
+         "#{relevant_filename}---#{input_paths}---#{output_path}".gsub(%r{[^\w/\.\-\+]+}, '')
+       end
+
+       # The input format to use.
+       #
+       # Respects the value of <tt>--input_format</tt>.
+       #
+       # @return [String]
+       def input_format
+         settings[:input_format]
+       end
+
+       # The output format to use.
+       #
+       # Respects the value of <tt>--output_format</tt>.
+       #
+       # @return [String]
+       def output_format
+         settings[:output_format]
+       end
+
+       # :nodoc:
+       def io_formats
+         input  = "-inputformat '#{input_format}'"   if input_format
+         output = "-outputformat '#{output_format}'" if output_format
+         [input, output]
+       end
+
+       # The name of the Hadoop binary to use.
+       #
+       # Respects the value of <tt>--hadoop_runner</tt> if given.
+       #
+       # @return [String]
+       def hadoop_runner
+         settings[:hadoop_runner] || File.join(settings[:hadoop_home], 'bin/hadoop')
+       end
+
+       # Return an array of jobconf (-D) options that will be passed to Hadoop.
+       #
+       # Translates the "friendly" <tt>wu-hadoop</tt> names into the
+       # less-friendly Hadoop names.
+       #
+       # @return [Array<String>]
+       def hadoop_jobconf_options
+         jobconf_options = []
+         settings[:reuse_jvms]          = '-1'    if (settings[:reuse_jvms] == true)
+         settings[:respect_exit_status] = 'false' if (settings[:ignore_exit_status] == true)
+         # If no reducer and no reduce_command, then skip the reduce phase
+         settings[:reduce_tasks] = 0 unless (reduce? || settings[:reduce_tasks].nil?)
+         # Fields hadoop should use to distribute records to reducers
+         unless settings[:partition_fields].blank?
+           jobconf_options += [jobconf(:partition_fields), jobconf(:output_field_separator)]
+         end
+         jobconf_options += [
+           :io_sort_mb, :io_sort_record_percent,
+           :map_speculative, :map_tasks,
+           :max_maps_per_cluster, :max_maps_per_node,
+           :max_node_map_tasks, :max_node_reduce_tasks,
+           :max_reduces_per_cluster, :max_reduces_per_node,
+           :max_record_length, :min_split_size,
+           :output_field_separator, :key_field_separator,
+           :partition_fields, :sort_fields,
+           :reduce_tasks, :respect_exit_status,
+           :reuse_jvms, :timeout,
+           :max_tracker_failures, :max_map_attempts,
+           :max_reduce_attempts
+         ].map do |opt|
+           defn = settings.definition_of(opt, :description)
+           val  = settings[opt]
+           java_opt(defn, val)
+         end
+         jobconf_options.flatten.compact
+       end
+
+       # Returns other arguments used by Hadoop streaming.
+       #
+       # @return [String]
+       def hadoop_other_args
+         extra_str_args = parsed_java_opts
+         if settings[:split_on_xml_tag]
+           extra_str_args << %Q{-inputreader 'StreamXmlRecordReader,begin=<#{settings[:split_on_xml_tag]}>,end=</#{settings[:split_on_xml_tag]}>'}
+         end
+         extra_str_args << ' -lazyOutput' if settings[:noempty]  # don't create reduce file if no records
+         extra_str_args << ' -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner' unless settings[:partition_fields].blank?
+         extra_str_args
+       end
+
+       # :nodoc:
+       #
+       # http://hadoop.apache.org/docs/r0.20.2/streaming.html#Package+Files+With+Job+Submissions
+       def hadoop_files
+         args.find_all { |arg| arg.to_s =~ /\.rb$/ }.map do |arg|
+           "-file '#{arg}'"
+         end
+       end
+
+       # :nodoc:
+       def ruby_interpreter_path
+         Pathname.new(File.join(Config::CONFIG['bindir'], Config::CONFIG['RUBY_INSTALL_NAME'] + Config::CONFIG['EXEEXT'])).realpath
+       end
+
+       # :nodoc:
+       def use_alternative_gemfile
+         ENV['BUNDLE_GEMFILE'] = settings[:gemfile]
+       end
+
+       # :nodoc:
+       def hadoop_recycle_env
+         use_alternative_gemfile if settings[:gemfile]
+         %w[BUNDLE_GEMFILE].map{ |var| %Q{-cmdenv '#{var}=#{ENV[var]}'} if ENV[var] }.compact
+       end
+
+       # :nodoc:
+       def parsed_java_opts
+         settings[:java_opts].map do |java_opt|
+           java_opt.split('-D').reject{ |opt| opt.blank? }.map{ |opt| '-D ' + opt.strip }
+         end.flatten
+       end
+
+       # :nodoc:
+       def java_opt option, value
+         "-D %s=%s" % [option, Shellwords.escape(value.to_s)] if value
+       end
+
+     end
+   end
+ end
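
Taken together, these methods assemble one copy-pasteable hadoop jar invocation. The sketch below shows the rough shape of the string hadoop_commandline returns; every path, processor name, and jobconf value is illustrative, and the -mapper/-reducer bodies come from mapper_commandline and reducer_commandline, which are defined in map_logic.rb and reduce_logic.rb rather than in this hunk:

  # Illustrative shape only -- not the output of a real run:
  #
  #   /usr/lib/hadoop/bin/hadoop \
  #     jar /usr/lib/hadoop/contrib/streaming/hadoop-*streaming*.jar \
  #     -D mapred.reduce.tasks=3 \
  #     -D mapred.job.name='tokenizer-counter---/data/in---/data/out' \
  #     -mapper 'wu-local tokenizer' \
  #     -reducer 'wu-local counter' \
  #     -input '/data/in' \
  #     -output '/data/out' \
  #     -file '/home/user/tokenizer.rb'
  #
  # Passing --dry_run prints the command instead of executing it, which is
  # the easiest way to inspect what would be submitted to the cluster.
  puts driver.hadoop_commandline   # assumes `driver` is an already-constructed Driver
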
data/lib/wukong-hadoop/driver/inputs_and_outputs.rb
@@ -0,0 +1,27 @@
+ module Wukong
+   module Hadoop
+
+     # Provides methods for determining input and output paths.
+     # Written as a separate module to allow easy overriding from other
+     # plugins.
+     module InputsAndOutputs
+
+       # The input paths to read from.
+       #
+       # @return [String]
+       def input_paths
+         (settings[:input] || [])
+       end
+
+       # The output path to write to.
+       #
+       # @return [String]
+       def output_path
+         settings[:output]
+       end
+
+     end
+   end
+ end
+
+
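
Since these two methods are the only seam between the driver and its notion of inputs and outputs, a plugin can swap in its own path logic by overriding them, as the module comment suggests. A hypothetical sketch (the module name and its date-expansion behavior are invented for illustration):

  # Hypothetical plugin: expand a {today} token in --input before the job runs.
  module DatedInputPaths
    def input_paths
      super.to_s.gsub('{today}', Time.now.strftime('%Y-%m-%d'))
    end
  end

  # On modern Rubies, Module#prepend keeps `super` pointing at the original.
  Wukong::Hadoop::Driver.prepend(DatedInputPaths)
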
data/lib/wukong-hadoop/driver/local_invocation.rb
@@ -0,0 +1,48 @@
+ require 'shellwords'
+ module Wukong
+   module Hadoop
+
+     # Provides methods for executing a map/reduce job locally on the
+     # command-line.
+     module LocalInvocation
+
+       # Returns the full local command used by Wukong-Hadoop when
+       # simulating a map/reduce job on the command-line.
+       #
+       # You should be able to run this command directly to simulate
+       # the job yourself.
+       #
+       # @return [String]
+       def local_commandline
+         [
+           [cat_input, mapper_commandline].tap do |pipeline|
+             pipeline.concat([sort_commandline, reducer_commandline]) if reduce?
+           end.flatten.compact.join(' | '),
+           cat_output
+         ].flatten.compact.join(' ')
+       end
+
+       # Returns the sort command used by Wukong-Hadoop when simulating
+       # a map/reduce job on the command-line.
+       #
+       # @return [String]
+       def sort_commandline
+         settings[:sort_command]
+       end
+
+       # :nodoc:
+       def cat_input
+         return unless input_paths && (!input_paths.empty?)
+         paths = Shellwords.join(input_paths.split(','))
+         "cat #{paths}"
+       end
+
+       # :nodoc:
+       def cat_output
+         return unless output_path
+         "> #{output_path}"
+       end
+
+     end
+   end
+ end
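
In contrast to the Hadoop mode above, local mode is nothing more than a shell pipeline. A sketch of the strings local_commandline produces, using the example processors shipped under data/examples; the output path is illustrative, and the wu-local pieces again come from mapper_commandline and reducer_commandline in other files:

  # With a reducer (map | sort | reduce):
  #   cat examples/sonnet_18.txt | wu-local tokenizer | sort | wu-local counter > counts.tsv
  #
  # Without a reducer, the sort and reduce stages simply drop out:
  #   cat examples/sonnet_18.txt | wu-local tokenizer > tokens.tsv
  driver.local_commandline   # assumes `driver` is a Driver constructed with --mode=local
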