wukong-hadoop 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. data/.gitignore +59 -0
  2. data/.rspec +2 -0
  3. data/Gemfile +3 -0
  4. data/README.md +339 -0
  5. data/Rakefile +13 -0
  6. data/bin/hdp-bin +44 -0
  7. data/bin/hdp-bzip +23 -0
  8. data/bin/hdp-cat +3 -0
  9. data/bin/hdp-catd +3 -0
  10. data/bin/hdp-cp +3 -0
  11. data/bin/hdp-du +86 -0
  12. data/bin/hdp-get +3 -0
  13. data/bin/hdp-kill +3 -0
  14. data/bin/hdp-kill-task +3 -0
  15. data/bin/hdp-ls +11 -0
  16. data/bin/hdp-mkdir +2 -0
  17. data/bin/hdp-mkdirp +12 -0
  18. data/bin/hdp-mv +3 -0
  19. data/bin/hdp-parts_to_keys.rb +77 -0
  20. data/bin/hdp-ps +3 -0
  21. data/bin/hdp-put +3 -0
  22. data/bin/hdp-rm +32 -0
  23. data/bin/hdp-sort +40 -0
  24. data/bin/hdp-stream +40 -0
  25. data/bin/hdp-stream-flat +22 -0
  26. data/bin/hdp-stream2 +39 -0
  27. data/bin/hdp-sync +17 -0
  28. data/bin/hdp-wc +67 -0
  29. data/bin/wu-hadoop +14 -0
  30. data/examples/counter.rb +17 -0
  31. data/examples/map_only.rb +28 -0
  32. data/examples/processors.rb +4 -0
  33. data/examples/sonnet_18.txt +14 -0
  34. data/examples/tokenizer.rb +28 -0
  35. data/examples/word_count.rb +44 -0
  36. data/features/step_definitions/wu_hadoop_steps.rb +4 -0
  37. data/features/support/env.rb +1 -0
  38. data/features/wu_hadoop.feature +113 -0
  39. data/lib/wukong-hadoop.rb +21 -0
  40. data/lib/wukong-hadoop/configuration.rb +133 -0
  41. data/lib/wukong-hadoop/driver.rb +190 -0
  42. data/lib/wukong-hadoop/driver/hadoop_invocation.rb +184 -0
  43. data/lib/wukong-hadoop/driver/inputs_and_outputs.rb +27 -0
  44. data/lib/wukong-hadoop/driver/local_invocation.rb +48 -0
  45. data/lib/wukong-hadoop/driver/map_logic.rb +104 -0
  46. data/lib/wukong-hadoop/driver/reduce_logic.rb +129 -0
  47. data/lib/wukong-hadoop/extensions.rb +2 -0
  48. data/lib/wukong-hadoop/hadoop_env_methods.rb +80 -0
  49. data/lib/wukong-hadoop/version.rb +6 -0
  50. data/spec/spec_helper.rb +21 -0
  51. data/spec/support/driver_helper.rb +15 -0
  52. data/spec/support/integration_helper.rb +39 -0
  53. data/spec/wukong-hadoop/driver_spec.rb +117 -0
  54. data/spec/wukong-hadoop/hadoop_env_methods_spec.rb +14 -0
  55. data/spec/wukong-hadoop/hadoop_mode_spec.rb +78 -0
  56. data/spec/wukong-hadoop/local_mode_spec.rb +22 -0
  57. data/spec/wukong-hadoop/wu_hadoop_spec.rb +34 -0
  58. data/wukong-hadoop.gemspec +33 -0
  59. metadata +168 -0
data/lib/wukong-hadoop/driver.rb
@@ -0,0 +1,190 @@
+ require 'shellwords'
+ require_relative("driver/inputs_and_outputs")
+ require_relative("driver/map_logic")
+ require_relative("driver/reduce_logic")
+ require_relative("driver/local_invocation")
+ require_relative("driver/hadoop_invocation")
+
+ module Wukong
+   module Hadoop
+
+     # The <tt>Hadoop::Driver</tt> class contains the logic to examine
+     # arguments and construct command lines which it will execute to
+     # create the desired behavior.
+     #
+     # The Hadoop::Driver will introspect on its arguments to guess (if
+     # not given) the processors to use as mapper and reducer in a
+     # map/reduce job. It will also decide whether to run that job in
+     # local or Hadoop mode. These decisions result in a command which
+     # it will ultimately execute.
+     class Driver < Wukong::Driver
+
+       include InputsAndOutputs
+       include MapLogic
+       include ReduceLogic
+       include HadoopInvocation
+       include LocalInvocation
+
28
+       # The settings used by this driver.
+       #
+       # @return [Configliere::Param]
+       attr_accessor :settings
+
+       # The (processed) arguments for this driver.
+       #
+       # @return [Array<String, Pathname>]
+       attr_reader :args
37
+
+       # Initialize and run a new Wukong::Hadoop::Driver for the given
+       # +settings+.
+       #
+       # Will rescue all Wukong::Error exceptions by printing a nice
+       # message to STDERR and exiting.
+       #
+       # @param [Configliere::Param] settings
+       # @param [Array<String>] extra_args
+       def self.run(settings, *extra_args)
+         begin
+           new(settings, *extra_args).run!
+         rescue Wukong::Error => e
+           $stderr.puts e.message
+           exit(127)
+         end
+       end
+
+       # Run this driver.
+       def run!
+         if mode == :local
+           # Log.info "Launching local!"
+           execute_command!(local_commandline)
+         else
+           ensure_input_and_output!
+           remove_output_path! if settings[:rm] || settings[:overwrite]
+           Log.info "Launching Hadoop!"
+           execute_command!(hadoop_commandline)
+         end
+       end
+
+       # Initialize a new driver with the given +settings+ and +args+.
+       #
+       # @param [Configliere::Param] settings
+       # @param [Array<String>] args
+       def initialize(settings, *args)
+         @settings = settings
+         self.args = args
+       end
+
+       # Set the +args+ for this driver.
+       #
+       # Arguments can be either (registered) processor names or files.
+       #
+       # An error will be raised on missing files or those which
+       # couldn't be loaded.
+       #
+       # An error will be raised if more than two arguments (mapper and
+       # reducer) are passed.
+       #
+       # @param [Array<String>] args
+       def args= args
+         raise Error.new("Cannot provide more than two arguments") if args.length > 2
+         @args = args.map do |arg|
+           if processor_registered?(arg)
+             arg
+           else
+             begin
+               rp = Pathname.new(arg).realpath
+               load rp
+               rp
+             rescue => e
+               raise Error.new("No such processor or file: #{arg}")
+             end
+           end
+         end
+       end
+
+       # What mode is this driver in?
+       #
+       # @return [:hadoop, :local]
+       def mode
+         settings[:mode].to_s == 'local' ? :local : :hadoop
+       end
+
+       # Were mapper and/or reducer named by a single argument?
+       #
+       # @return [true, false]
+       def single_job_arg?
+         args.size == 1
+       end
+
+       # Were mapper and/or reducer named by separate arguments?
+       #
+       # @return [true, false]
+       def separate_map_and_reduce_args?
+         args.size == 2
+       end
+
+       # Is there a processor registered with the given +name+?
+       #
+       # @param [#to_s] name
+       # @return [true, false]
+       def processor_registered? name
+         Wukong.registry.registered?(name.to_s.to_sym)
+       end
+
+       # Return the guessed name of a processor at the given +path+.
+       #
+       # @param [String] path
+       # @return [String]
+       def processor_name_from_file(path)
+         File.basename(path, '.rb')
+       end
+
+       # Does the given +path+ contain a processor named after itself?
+       #
+       # @param [String] path
+       # @return [true, false]
+       def file_is_processor?(path)
+         processor_registered?(processor_name_from_file(path))
+       end
149
+
+       # The prefix to insert before all invocations of the
+       # <tt>wu-local</tt> runner.
+       #
+       # @return [String]
+       def command_prefix
+         settings[:command_prefix]
+       end
157
+
+       # Returns parameters to pass to an invocation of
+       # <tt>wu-local</tt>.
+       #
+       # Parameters like <tt>--reduce_tasks</tt> which are relevant to
+       # Wukong-Hadoop will be interpreted and *not* passed. Others
+       # will be passed unmodified.
+       #
+       # @return [String]
+       def params_to_pass
+         s = (Wukong.loaded_deploy_pack? ? Deploy.pre_deploy_settings : settings)
+         s.reject{ |param, val| s.definition_of(param, :wukong_hadoop) }.map{ |param,val| "--#{param}=#{Shellwords.escape(val.to_s)}" }.join(" ")
+       end
170
+
+       # Execute a command composed of the given parts.
+       #
+       # Will print the command instead of executing it if the
+       # <tt>--dry_run</tt> option was given.
+       #
+       # @param [Array<String>] args
+       def execute_command!(*args)
+         command = args.flatten.reject(&:blank?).join(" \\\n ")
+         if settings[:dry_run]
+           Log.info("Dry run:")
+           puts command
+         else
+           puts `#{command}`
+           raise "Streaming command failed!" unless $?.success?
+         end
+       end
+
+     end
+   end
+ end
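
For orientation, here is a minimal sketch of driving this class directly, using the example processors from the file list above. The settings keys (:mode, :input, :output) are the ones the driver code reads; the paths are hypothetical, and real invocations go through bin/wu-hadoop, which resolves defaults from data/lib/wukong-hadoop/configuration.rb rather than setting them by hand:

    require 'configliere'
    require 'wukong-hadoop'

    # Hypothetical, hand-rolled settings; bin/wu-hadoop normally builds
    # these from the command line.
    settings = Configliere::Param.new
    settings[:mode]   = 'local'                    # simulate the job as a shell pipeline
    settings[:input]  = 'examples/sonnet_18.txt'   # comma-separated list of paths
    settings[:output] = 'word_counts.tsv'

    # Two arguments name the mapper and reducer explicitly (see #args= above).
    Wukong::Hadoop::Driver.run(settings, 'examples/tokenizer.rb', 'examples/counter.rb')
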
data/lib/wukong-hadoop/driver/hadoop_invocation.rb
@@ -0,0 +1,184 @@
+ module Wukong
+   module Hadoop
+
+     # Provides methods for executing a map/reduce job on a Hadoop
+     # cluster via {Hadoop
+     # streaming}[http://hadoop.apache.org/docs/r0.15.2/streaming.html].
+     module HadoopInvocation
+
+       # Raise an error unless we have input and output.
+       def ensure_input_and_output!
+         raise Error.new("Explicit --input and --output paths are required to run a job in Hadoop mode.") if input_paths.nil? || input_paths.empty? || output_path.nil? || output_path.empty?
+       end
+
+       # Remove the output path.
+       #
+       # Will not actually do anything if the <tt>--dry_run</tt> option
+       # is also given.
+       def remove_output_path!
+         cmd = %Q{#{settings[:hadoop_runner]} fs -rmr '#{output_path}'}
+         Log.info "Removing output file #{output_path}: #{cmd}"
+         puts `#{cmd}` unless settings[:dry_run]
+       end
+
+       # Return the Hadoop command used to launch this job in a Hadoop
+       # cluster.
+       #
+       # You should be able to copy, paste, and run this command
+       # unmodified when debugging.
+       #
+       # @return [String]
+       def hadoop_commandline
+         [
+           hadoop_runner,
+           "jar #{settings[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
+           hadoop_jobconf_options,
+           "-D mapred.job.name='#{job_name}'",
+           hadoop_other_args,
+           "-mapper '#{mapper_commandline}'",
+           "-reducer '#{reducer_commandline}'",
+           "-input '#{input_paths}'",
+           "-output '#{output_path}'",
+           hadoop_files,
+           io_formats,
+           hadoop_recycle_env,
+         ].flatten.compact.join(" \t\\\n ")
+       end
+
+       # The job name that will be passed to Hadoop.
+       #
+       # Respects the <tt>--job_name</tt> option if given, otherwise
+       # constructs one from the given processors, input, and output
+       # paths.
+       #
+       # @return [String]
+       def job_name
+         return settings[:job_name] if settings[:job_name]
+         relevant_filename = args.compact.uniq.map { |path| File.basename(path, '.rb') }.join('-')
+         "#{relevant_filename}---#{input_paths}---#{output_path}".gsub(%r{[^\w/\.\-\+]+}, '')
+       end
+
+       # The input format to use.
+       #
+       # Respects the value of <tt>--input_format</tt>.
+       #
+       # @return [String]
+       def input_format
+         settings[:input_format]
+       end
+
+       # The output format to use.
+       #
+       # Respects the value of <tt>--output_format</tt>.
+       #
+       # @return [String]
+       def output_format
+         settings[:output_format]
+       end
+
+       # :nodoc:
+       def io_formats
+         input = "-inputformat '#{input_format}'" if input_format
+         output = "-outputformat '#{output_format}'" if output_format
+         [input, output]
+       end
+
+       # The name of the Hadoop binary to use.
+       #
+       # Respects the value of <tt>--hadoop_runner</tt> if given.
+       #
+       # @return [String]
+       def hadoop_runner
+         settings[:hadoop_runner] || File.join(settings[:hadoop_home], 'bin/hadoop')
+       end
+
+       # Return an array of jobconf (-D) options that will be passed to Hadoop.
+       #
+       # Translates the "friendly" <tt>wu-hadoop</tt> names into the
+       # less-friendly Hadoop names.
+       #
+       # @return [Array<String>]
+       def hadoop_jobconf_options
+         jobconf_options = []
+         settings[:reuse_jvms] = '-1' if (settings[:reuse_jvms] == true)
+         settings[:respect_exit_status] = 'false' if (settings[:ignore_exit_status] == true)
+         # If no reducer and no reduce_command, then skip the reduce phase
+         settings[:reduce_tasks] = 0 unless (reduce? || settings[:reduce_tasks].nil?)
+         # Fields hadoop should use to distribute records to reducers
+         unless settings[:partition_fields].blank?
+           jobconf_options += [jobconf(:partition_fields), jobconf(:output_field_separator)]
+         end
+         jobconf_options += [
+           :io_sort_mb, :io_sort_record_percent,
+           :map_speculative, :map_tasks,
+           :max_maps_per_cluster, :max_maps_per_node,
+           :max_node_map_tasks, :max_node_reduce_tasks,
+           :max_reduces_per_cluster, :max_reduces_per_node,
+           :max_record_length, :min_split_size,
+           :output_field_separator, :key_field_separator,
+           :partition_fields, :sort_fields,
+           :reduce_tasks, :respect_exit_status,
+           :reuse_jvms, :timeout,
+           :max_tracker_failures, :max_map_attempts,
+           :max_reduce_attempts
+         ].map do |opt|
+           defn = settings.definition_of(opt, :description)
+           val = settings[opt]
+           java_opt(defn, val)
+         end
+         jobconf_options.flatten.compact
+       end
+
132
+       # Returns other arguments used by Hadoop streaming.
+       #
+       # @return [Array<String>]
+       def hadoop_other_args
+         extra_str_args = parsed_java_opts
+         if settings[:split_on_xml_tag]
+           extra_str_args << %Q{-inputreader 'StreamXmlRecordReader,begin=<#{settings[:split_on_xml_tag]}>,end=</#{settings[:split_on_xml_tag]}>'}
+         end
+         extra_str_args << ' -lazyOutput' if settings[:noempty] # don't create reduce file if no records
+         extra_str_args << ' -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner' unless settings[:partition_fields].blank?
+         extra_str_args
+       end
144
+
+       # :nodoc:
+       #
+       # http://hadoop.apache.org/docs/r0.20.2/streaming.html#Package+Files+With+Job+Submissions
+       def hadoop_files
+         args.find_all { |arg| arg.to_s =~ /\.rb$/ }.map do |arg|
+           "-file '#{arg}'"
+         end
+       end
+
+       # :nodoc:
+       def ruby_interpreter_path
+         Pathname.new(File.join(Config::CONFIG['bindir'], Config::CONFIG['RUBY_INSTALL_NAME'] + Config::CONFIG['EXEEXT'])).realpath
+       end
+
+       # :nodoc:
+       def use_alternative_gemfile
+         ENV['BUNDLE_GEMFILE'] = settings[:gemfile]
+       end
+
+       # :nodoc:
+       def hadoop_recycle_env
+         use_alternative_gemfile if settings[:gemfile]
+         %w[BUNDLE_GEMFILE].map{ |var| %Q{-cmdenv '#{var}=#{ENV[var]}'} if ENV[var] }.compact
+       end
+
+       # :nodoc:
+       def parsed_java_opts
+         settings[:java_opts].map do |java_opt|
+           java_opt.split('-D').reject{ |opt| opt.blank? }.map{ |opt| '-D ' + opt.strip }
+         end.flatten
+       end
+
+       # :nodoc:
+       def java_opt option, value
+         "-D %s=%s" % [option, Shellwords.escape(value.to_s)] if value
+       end
+
+     end
+   end
+ end
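
To make the assembled command concrete: with hypothetical --input/--output paths and a single word_count.rb processor, hadoop_commandline yields one copy-pasteable streaming invocation shaped roughly like the sketch below. The -mapper and -reducer strings are built by MapLogic and ReduceLogic (elsewhere in this release), so they appear here only as placeholders:

    /usr/lib/hadoop/bin/hadoop \
        jar /usr/lib/hadoop/contrib/streaming/hadoop-*streaming*.jar \
        -D mapred.reduce.tasks=1 \
        -D mapred.job.name='word_count---/data/in---/data/out' \
        -mapper '<mapper command from MapLogic>' \
        -reducer '<reducer command from ReduceLogic>' \
        -input '/data/in' \
        -output '/data/out' \
        -file '/absolute/path/to/word_count.rb'
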
data/lib/wukong-hadoop/driver/inputs_and_outputs.rb
@@ -0,0 +1,27 @@
+ module Wukong
+   module Hadoop
+
+     # Provides methods for determining input and output paths.
+     # Written as a separate module to allow easy overriding from other
+     # plugins.
+     module InputsAndOutputs
+
+       # The input paths to read from.
+       #
+       # @return [String]
+       def input_paths
+         (settings[:input] || [])
+       end
+
+       # The output path to write to.
+       #
+       # @return [String]
+       def output_path
+         settings[:output]
+       end
+
+     end
+   end
+ end
+
+
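
As a concrete illustration of the "easy overriding from other plugins" mentioned in the comment: Ruby resolves methods through mixed-in modules at call time, so a downstream plugin can simply reopen the module and every driver that includes it picks up the change. This is a sketch only; :output_root is an invented setting, not part of this gem:

    # Hypothetical plugin code forcing all output under a common root.
    module Wukong
      module Hadoop
        module InputsAndOutputs
          def output_path
            File.join(settings[:output_root] || '/tmp', settings[:output].to_s)
          end
        end
      end
    end
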
data/lib/wukong-hadoop/driver/local_invocation.rb
@@ -0,0 +1,48 @@
+ require 'shellwords'
+ module Wukong
+   module Hadoop
+
+     # Provides methods for executing a map/reduce job locally on the
+     # command-line.
+     module LocalInvocation
+
9
+       # Returns the full local command used by Wukong-Hadoop when
+       # simulating a map/reduce job on the command-line.
+       #
+       # You should be able to run this command directly to simulate
+       # the job yourself.
+       #
+       # @return [String]
+       def local_commandline
+         [
+           [cat_input, mapper_commandline].tap do |pipeline|
+             pipeline.concat([sort_commandline, reducer_commandline]) if reduce?
+           end.flatten.compact.join(' | '),
+           cat_output
+         ].flatten.compact.join(' ')
+       end
24
+
+       # Returns the sort command used by Wukong-Hadoop when simulating
+       # a map/reduce job on the command-line.
+       #
+       # @return [String]
+       def sort_commandline
+         settings[:sort_command]
+       end
+
+       # :nodoc:
+       def cat_input
+         return unless input_paths && (!input_paths.empty?)
+         paths = Shellwords.join(input_paths.split(','))
+         "cat #{paths}"
+       end
+
+       # :nodoc:
+       def cat_output
+         return unless output_path
+         "> #{output_path}"
+       end
+
+     end
+   end
+ end
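
Taken together, local_commandline collapses the whole job into a classic Unix pipeline. Assuming the default sort command is plain sort(1) and using placeholder mapper/reducer commands (those are built by MapLogic and ReduceLogic), a run over the earlier hypothetical inputs looks roughly like:

    cat examples/sonnet_18.txt | <mapper command> | sort | <reducer command> > word_counts.tsv

The sort stage stands in for Hadoop's shuffle: it places identical keys on consecutive lines, which is exactly the grouping contract a streaming reducer depends on.
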