wukong-hadoop 0.0.2 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,137 +0,0 @@
1
module Wukong
  module Hadoop

    # Configure the given settings object for use with Wukong::Hadoop.
    #
    # Every setting defined here carries the +wukong_hadoop: true+ flag
    # so that it can later be filtered out of the parameters passed
    # through to wu-local (see Driver#params_to_pass).  Settings also
    # flagged +jobconf: true+ are friendly aliases for the Hadoop
    # property named in their description.
    #
    # @param [Configliere::Param] settings the settings to configure
    # @return [Configliere::Param] the configured settings
    def self.configure settings
      # Hadoop Options
      settings.define :hadoop_home, wukong_hadoop: true, description: 'Path to hadoop installation. HADOOP_HOME/bin/hadoop is used to run hadoop.', env_var: 'HADOOP_HOME', default: '/usr/lib/hadoop'
      settings.define :hadoop_runner, wukong_hadoop: true, description: 'Path to hadoop executable. Use this for non-standard hadoop installations.'

      # Translate simplified args to their hairy hadoop equivalents
      settings.define :io_sort_mb, wukong_hadoop: true, jobconf: true, description: 'io.sort.mb'
      settings.define :io_sort_record_percent, wukong_hadoop: true, jobconf: true, description: 'io.sort.record.percent'
      settings.define :job_name, wukong_hadoop: true, jobconf: true, description: 'mapred.job.name'
      settings.define :key_field_separator, wukong_hadoop: true, jobconf: true, description: 'map.output.key.field.separator'
      settings.define :map_speculative, wukong_hadoop: true, jobconf: true, description: 'mapred.map.tasks.speculative.execution'
      settings.define :reduce_speculative, wukong_hadoop: true, jobconf: true, description: 'mapred.reduce.tasks.speculative.execution'
      settings.define :map_tasks, wukong_hadoop: true, jobconf: true, description: 'mapred.map.tasks'
      settings.define :max_maps_per_cluster, wukong_hadoop: true, jobconf: true, description: 'mapred.max.maps.per.cluster'
      settings.define :max_maps_per_node, wukong_hadoop: true, jobconf: true, description: 'mapred.max.maps.per.node'
      settings.define :max_node_map_tasks, wukong_hadoop: true, jobconf: true, description: 'mapred.tasktracker.map.tasks.maximum'
      settings.define :max_node_reduce_tasks, wukong_hadoop: true, jobconf: true, description: 'mapred.tasktracker.reduce.tasks.maximum'
      settings.define :max_record_length, wukong_hadoop: true, jobconf: true, description: 'mapred.linerecordreader.maxlength'
      settings.define :max_reduces_per_cluster, wukong_hadoop: true, jobconf: true, description: 'mapred.max.reduces.per.cluster'
      settings.define :max_reduces_per_node, wukong_hadoop: true, jobconf: true, description: 'mapred.max.reduces.per.node'
      settings.define :max_tracker_failures, wukong_hadoop: true, jobconf: true, description: 'mapred.max.tracker.failures'
      settings.define :max_map_attempts, wukong_hadoop: true, jobconf: true, description: 'mapred.map.max.attempts'
      settings.define :max_reduce_attempts, wukong_hadoop: true, jobconf: true, description: 'mapred.reduce.max.attempts'
      settings.define :min_split_size, wukong_hadoop: true, jobconf: true, description: 'mapred.min.split.size'
      settings.define :output_field_separator, wukong_hadoop: true, jobconf: true, description: 'stream.map.output.field.separator'
      settings.define :partition_fields, wukong_hadoop: true, jobconf: true, description: 'num.key.fields.for.partition'
      settings.define :reduce_tasks, wukong_hadoop: true, jobconf: true, description: 'mapred.reduce.tasks'
      settings.define :respect_exit_status, wukong_hadoop: true, jobconf: true, description: 'stream.non.zero.exit.is.failure'
      settings.define :reuse_jvms, wukong_hadoop: true, jobconf: true, description: 'mapred.job.reuse.jvm.num.tasks'
      settings.define :sort_fields, wukong_hadoop: true, jobconf: true, description: 'stream.num.map.output.key.fields'
      settings.define :timeout, wukong_hadoop: true, jobconf: true, description: 'mapred.task.timeout'
      settings.define :noempty, wukong_hadoop: true, description: "Don't create zero-byte reduce files"
      settings.define :split_on_xml_tag, wukong_hadoop: true, description: "Parse XML document by specifying the tag name: 'anything found between <tag> and </tag> will be treated as one record for map tasks'"
      settings.define :input_format, wukong_hadoop: true, description: 'Fully qualified Java class name defining an alternative InputFormat.'
      settings.define :output_format, wukong_hadoop: true, description: 'Fully qualified Java class name defining an alternative OutputFormat.'
      settings.define :java_opts, wukong_hadoop: true, description: 'Additional Java options to be passed to hadoop streaming.', :type => Array, :default => []
      settings.define :files, wukong_hadoop: true, description: "Comma-separated list of files (or globs) to be copied to the MapReduce cluster (-files).", :type => Array, :default => []
      settings.define :jars, wukong_hadoop: true, description: "Comma-separated list of jars (or globs) to include on the Hadoop CLASSPATH (-libjars).", :type => Array, :default => []
      settings.define :archives, wukong_hadoop: true, description: "Comma-separated list of archives to be unarchived on each worker (-archives).", :type => Array, :default => []

      # Options given on the command-line
      settings.define :mode, description: "Run in either 'hadoop' or 'local' mode", wukong_hadoop: true, :default => 'hadoop'
      settings.define :map_command, description: "Shell command to run as mapper, in place of a constructed wu-local command", wukong_hadoop: true
      settings.define :reduce_command, description: "Shell command to run as reducer, in place of a constructed wu-local command", wukong_hadoop: true
      settings.define :sort_command, description: "Shell command to run as sorter (only in `local' mode)", wukong_hadoop: true, :default => 'sort'
      # FIX: description previously read "Prefex"
      settings.define :command_prefix, description: "Prefix to insert before all Wukong commands", wukong_hadoop: true
      settings.define :mapper, description: "Name of processor to use as a mapper", wukong_hadoop: true
      settings.define :reducer, description: "Name of processor to use as a reducer", wukong_hadoop: true
      settings.define :gemfile, description: "Specify an alternative Gemfile to execute this wukong script with", wukong_hadoop: true
      settings.define :dry_run, description: "Echo the command that will be run, but don't run it", wukong_hadoop: true, :type => :boolean, :default => false
      settings.define :rm, description: "Recursively remove the destination directory.", wukong_hadoop: true, :type => :boolean, :default => false
      settings.define :input, description: "Comma-separated list of input paths", wukong_hadoop: true
      settings.define :output, description: "Output path.", wukong_hadoop: true

      settings.use(:commandline)

      # Singleton usage banner for this settings object.
      def settings.usage
        "usage: #{File.basename($0)} PROCESSOR|FLOW [PROCESSOR|FLOW] [ --param=value | -p value | --param | -p]"
      end

      settings.description = <<EOF
wu-hadoop is a tool to model and launch Wukong processors as
map/reduce workflows within the Hadoop framework.

Use wu-hadoop with existing processors in `local' mode to test the
logic of your job, reading from the specified --input and printing to
STDOUT:

  $ wu-hadoop examples/word_count.rb --mode=local --input=examples/sonnet_18.txt
  a	2
  all	1
  and	2
  ...

where it is assumed that your mapper is called 'mapper' and your
reducer 'reducer'.  You can also cat in data:

  $ cat examples/sonnet_18.txt | wu-hadoop examples/word_count.rb --mode=local

Or pass options directly:

  $ wu-hadoop examples/word_count.rb --mode=local --input=examples/sonnet_18.txt --fold_case --min_length=3
  all	1
  and	5
  art	1
  brag	1
  ...

Or define both processors in separate files:

  $ wu-hadoop examples/tokenizer.rb examples/counter.rb --mode=local --input=examples/sonnet_18.txt

Or by name:

  $ wu-hadoop examples/processors.rb --mode=local --input=examples/sonnet_18.txt --mapper=tokenizer --reducer=counter

Or just by command:

  $ wu-hadoop processors.rb --mapper=tokenizer --reduce_command='uniq -c' ...
  $ wu-hadoop processors.rb --map_command='cut -f3' --reducer=counter ...
  $ wu-hadoop --map_command='cut -f3' --reduce_command='uniq -c' ...

If you don't specify a --reducer explicitly, and you didn't give two
separate arguments, and no processor named :reducer exists in the
environment, then we assume you are launching a map-only job and
'mapred.tasktracker.reduce.tasks.maximum' will correspondingly be set
to 0:

  $ wu-hadoop examples/tokenizer.rb --mode=local --input=examples/sonnet_18.txt
  Shall
  I
  compare
  thee
  ...

You can achieve this directly with the --reduce_tasks=0 option.

Many other Hadoop options have been wrapped with similarly friendly
names below.  These are ignored when running in `local' mode.

Some options (like `--sort_command') only make sense in `local' mode.
These are ignored in `hadoop' mode.
EOF
      settings
    end

    # All Hadoop configuration for Wukong lives within this object.
    Configuration = configure(Configliere::Param.new) unless defined? Configuration
  end
end
@@ -1,191 +0,0 @@
1
- require 'shellwords'
2
- require_relative("driver/inputs_and_outputs")
3
- require_relative("driver/map_logic")
4
- require_relative("driver/reduce_logic")
5
- require_relative("driver/local_invocation")
6
- require_relative("driver/hadoop_invocation")
7
-
8
module Wukong
  module Hadoop

    # The <tt>Hadoop::Driver</tt> class contains the logic to examine
    # arguments and construct command lines which it will execute to
    # create the desired behavior.
    #
    # The Hadoop::Driver will introspect on its arguments to guess (if
    # not given) the processors to use as mapper and reducer in a
    # map/reduce job.  It will also decide whether to run that job in
    # local or Hadoop mode.  These decisions result in a command which
    # it will ultimately execute.
    class Driver < Wukong::Driver

      include InputsAndOutputs
      include MapLogic
      include ReduceLogic
      include HadoopInvocation
      include LocalInvocation
      include Logging

      # The settings used by this driver.
      #
      # @return [Configliere::Param]
      attr_accessor :settings

      # The (processed) arguments for this driver.
      #
      # @return [Array<String, Pathname>]
      attr_reader :args

      # Initialize and run a new Wukong::Hadoop::Driver for the given
      # +settings+.
      #
      # Will rescue all Wukong::Error exceptions by printing a nice
      # message to STDERR and exiting.
      #
      # @param [Configliere::Param] settings
      # @param [Array<String>] extra_args
      def self.run(settings, *extra_args)
        begin
          new(settings, *extra_args).run!
        rescue Wukong::Error => e
          $stderr.puts e.message
          # 127 mirrors the shell's "command not found" convention so
          # wrapping scripts can distinguish driver errors from job failures.
          exit(127)
        end
      end

      # Run this driver.
      #
      # In local mode the command is executed immediately; in Hadoop
      # mode input/output paths are validated (and the output path
      # optionally removed) before launching.
      def run!
        if mode == :local
          # log.info "Launching local!"
          execute_command!(local_commandline)
        else
          ensure_input_and_output!
          # --overwrite is honored here as an alias for --rm, though only
          # --rm is defined by Wukong::Hadoop.configure.
          remove_output_path! if settings[:rm] || settings[:overwrite]
          log.info "Launching Hadoop!"
          execute_command!(hadoop_commandline)
        end
      end

      # Initialize a new driver with the given +settings+ and +args+.
      #
      # @param [Configliere::Param] settings
      # @param [Array<String>] args
      def initialize(settings, *args)
        @settings = settings
        # Use the writer so args are validated and loaded (see #args=).
        self.args = args
      end

      # Set the +args+ for this driver.
      #
      # Arguments can be either (registered) processor names or files.
      #
      # An error will be raised on missing files or those which
      # couldn't be loaded.
      #
      # An error will be raised if more than two arguments (mapper and
      # reducer) are passed.
      #
      # @param [Array<String>] args
      def args= args
        raise Error.new("Cannot provide more than two arguments") if args.length > 2
        @args = args.map do |arg|
          if processor_registered?(arg)
            arg
          else
            # Not a known processor: treat it as a Ruby file.  Loading it
            # may register new processors as a side effect.
            begin
              rp = Pathname.new(arg).realpath
              load rp
              rp
            rescue => e
              # Any failure (missing path, load error) is reported uniformly.
              raise Error.new("No such processor or file: #{arg}")
            end
          end
        end
      end

      # What mode is this driver in?
      #
      # Anything other than the literal string 'local' counts as Hadoop.
      #
      # @return [:hadoop, :local]
      def mode
        settings[:mode].to_s == 'local' ? :local : :hadoop
      end

      # Were mapper and/or reducer named by a single argument?
      #
      # @return [true, false]
      def single_job_arg?
        args.size == 1
      end

      # Were mapper and/or reducer named by separate arguments?
      #
      # @return [true, false]
      def separate_map_and_reduce_args?
        args.size == 2
      end

      # Is there a processor registered with the given +name+?
      #
      # @param [#to_s] name
      # @return [true, false]
      def processor_registered? name
        Wukong.registry.registered?(name.to_s.to_sym)
      end

      # Return the guessed name of a processor at the given +path+.
      #
      # Simply the file's basename without its .rb extension.
      #
      # @param [String] path
      # @return [String]
      def processor_name_from_file(path)
        File.basename(path, '.rb')
      end

      # Does the given +path+ contain a processor named after itself?
      #
      # @param [String] path
      # @return [true, false]
      def file_is_processor?(path)
        processor_registered?(processor_name_from_file(path))
      end

      # The prefix to insert before all invocations of the
      # <tt>wu-local</tt> runner.
      #
      # @return [String]
      def command_prefix
        settings[:command_prefix]
      end

      # Returns parameters to pass to an invocation of
      # <tt>wu-local</tt>.
      #
      # Parameters like <tt>--reduce_tasks</tt> which are relevant to
      # Wukong-Hadoop will be interpreted and *not* passed.  Others
      # will be passed unmodified.
      #
      # @return [String]
      def params_to_pass
        # In a deploy pack, pass the pre-deploy settings instead of our own.
        s = (Wukong.loaded_deploy_pack? ? Deploy.pre_deploy_settings : settings)
        # Settings defined with the wukong_hadoop flag are ours; everything
        # else is forwarded, shell-escaped, as --param=value pairs.
        s.reject{ |param, val| s.definition_of(param, :wukong_hadoop) }.map{ |param,val| "--#{param}=#{Shellwords.escape(val.to_s)}" }.join(" ")
      end

      # Execute a command composed of the given parts.
      #
      # Will print the command instead of running it if the
      # <tt>--dry_run</tt> option was given.
      #
      # @param [Array<String>] args
      def execute_command!(*args)
        # Join the non-blank parts with backslash-newlines so the printed
        # command is a readable, copy-pasteable multi-line shell command.
        command = args.flatten.reject(&:blank?).join(" \\\n ")
        if settings[:dry_run]
          log.info("Dry run:")
          puts command
        else
          # Backticks both run the command and capture its STDOUT, which
          # is echoed; $? holds the child's exit status.
          puts `#{command}`
          raise "Streaming command failed!" unless $?.success?
        end
      end

    end
  end
end
@@ -1,27 +0,0 @@
1
module Wukong
  module Hadoop

    # Provides methods for determining input and output paths.
    # Written as a separate module to allow easy overriding from other
    # plugins.
    #
    # Host classes must provide a +settings+ method.
    module InputsAndOutputs

      # The input paths to read from.
      #
      # @return [String, Array] whatever the +input+ setting holds, or
      #   an empty Array when it is unset
      def input_paths
        configured = settings[:input]
        configured || []
      end

      # The output path to write to.
      #
      # @return [String, nil] the +output+ setting, if any
      def output_path
        settings[:output]
      end

    end
  end
end
26
-
27
-
@@ -1,15 +0,0 @@
1
module Wukong
  module Hadoop
    # Convenience mixin for constructing a configured Driver instance
    # (used by specs and scripts).
    module DriverHelper

      # Build a Wukong::Hadoop::Driver from freshly-configured,
      # resolved settings.
      #
      # A trailing Hash argument, if given, is merged into the settings
      # as overrides; all remaining arguments are passed to the Driver.
      def driver *args
        overrides = args.last.is_a?(Hash) ? args.pop : nil
        params = ::Wukong::Hadoop.configure(Configliere::Param.new)
        params.resolve!
        params.merge!(overrides) if overrides
        Wukong::Hadoop::Driver.new(params, *args)
      end

    end
  end
end
15
-
@@ -1,39 +0,0 @@
1
module Wukong
  module Hadoop
    # Path and environment helpers for integration tests: locates the
    # project root and its lib/, bin/ and examples/ subdirectories, and
    # builds the PATH/RUBYLIB environment used to spawn wu-hadoop.
    module IntegrationHelper

      # Project root, three directory levels above this file (memoized).
      #
      # @return [Pathname]
      def root
        @root ||= Pathname.new(__FILE__).expand_path.dirname.parent.parent
      end

      # Path under the project's lib/ directory.
      def lib_dir *segments
        root.join('lib', *segments)
      end

      # Path under the project's bin/ directory.
      def bin_dir *segments
        root.join('bin', *segments)
      end

      # Path under the project's examples/ directory.
      def examples_dir *segments
        root.join('examples', *segments)
      end

      # Environment hash prepending this project's bin/ to PATH and
      # lib/ to RUBYLIB (existing values, when present, are kept after).
      def integration_env
        path_entries    = [bin_dir.to_s, ENV["PATH"]].compact
        rubylib_entries = [lib_dir.to_s, ENV["RUBYLIB"]].compact
        {
          "PATH"    => path_entries.join(':'),
          "RUBYLIB" => rubylib_entries.join(':')
        }
      end

      # Working directory for spawned integration commands.
      def integration_cwd
        root.to_s
      end

      # Path to an example script; alias-style wrapper around #examples_dir.
      def example_script *segments
        examples_dir(*segments)
      end

    end
  end
end
39
-