wukong-hadoop 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. data/.gitignore +59 -0
  2. data/.rspec +2 -0
  3. data/Gemfile +3 -0
  4. data/README.md +339 -0
  5. data/Rakefile +13 -0
  6. data/bin/hdp-bin +44 -0
  7. data/bin/hdp-bzip +23 -0
  8. data/bin/hdp-cat +3 -0
  9. data/bin/hdp-catd +3 -0
  10. data/bin/hdp-cp +3 -0
  11. data/bin/hdp-du +86 -0
  12. data/bin/hdp-get +3 -0
  13. data/bin/hdp-kill +3 -0
  14. data/bin/hdp-kill-task +3 -0
  15. data/bin/hdp-ls +11 -0
  16. data/bin/hdp-mkdir +2 -0
  17. data/bin/hdp-mkdirp +12 -0
  18. data/bin/hdp-mv +3 -0
  19. data/bin/hdp-parts_to_keys.rb +77 -0
  20. data/bin/hdp-ps +3 -0
  21. data/bin/hdp-put +3 -0
  22. data/bin/hdp-rm +32 -0
  23. data/bin/hdp-sort +40 -0
  24. data/bin/hdp-stream +40 -0
  25. data/bin/hdp-stream-flat +22 -0
  26. data/bin/hdp-stream2 +39 -0
  27. data/bin/hdp-sync +17 -0
  28. data/bin/hdp-wc +67 -0
  29. data/bin/wu-hadoop +14 -0
  30. data/examples/counter.rb +17 -0
  31. data/examples/map_only.rb +28 -0
  32. data/examples/processors.rb +4 -0
  33. data/examples/sonnet_18.txt +14 -0
  34. data/examples/tokenizer.rb +28 -0
  35. data/examples/word_count.rb +44 -0
  36. data/features/step_definitions/wu_hadoop_steps.rb +4 -0
  37. data/features/support/env.rb +1 -0
  38. data/features/wu_hadoop.feature +113 -0
  39. data/lib/wukong-hadoop.rb +21 -0
  40. data/lib/wukong-hadoop/configuration.rb +133 -0
  41. data/lib/wukong-hadoop/driver.rb +190 -0
  42. data/lib/wukong-hadoop/driver/hadoop_invocation.rb +184 -0
  43. data/lib/wukong-hadoop/driver/inputs_and_outputs.rb +27 -0
  44. data/lib/wukong-hadoop/driver/local_invocation.rb +48 -0
  45. data/lib/wukong-hadoop/driver/map_logic.rb +104 -0
  46. data/lib/wukong-hadoop/driver/reduce_logic.rb +129 -0
  47. data/lib/wukong-hadoop/extensions.rb +2 -0
  48. data/lib/wukong-hadoop/hadoop_env_methods.rb +80 -0
  49. data/lib/wukong-hadoop/version.rb +6 -0
  50. data/spec/spec_helper.rb +21 -0
  51. data/spec/support/driver_helper.rb +15 -0
  52. data/spec/support/integration_helper.rb +39 -0
  53. data/spec/wukong-hadoop/driver_spec.rb +117 -0
  54. data/spec/wukong-hadoop/hadoop_env_methods_spec.rb +14 -0
  55. data/spec/wukong-hadoop/hadoop_mode_spec.rb +78 -0
  56. data/spec/wukong-hadoop/local_mode_spec.rb +22 -0
  57. data/spec/wukong-hadoop/wu_hadoop_spec.rb +34 -0
  58. data/wukong-hadoop.gemspec +33 -0
  59. metadata +168 -0
data/bin/hdp-sync ADDED
@@ -0,0 +1,17 @@
1
+ #!/usr/bin/env ruby
2
+ require 'wukong'
3
+
4
+ src_dir, dest_dir = ARGV[0..1]
5
+ src_files = Dir[src_dir + '/*']
6
+ dest_files = Wukong::Dfs.list_files dest_dir
7
+ Wukong::Dfs.compare_listings(src_files, dest_files) do |comparison, src_file, dest_file|
8
+ case comparison
9
+ when :missing
10
+ dest_filename = "%s/%s" % [dest_dir, dest_file]
11
+ puts "Copying #{src_file} #{dest_filename}"
12
+ puts `hadoop dfs -put #{src_file} #{dest_filename}`
13
+ when :differ
14
+ src_ls = `ls -l #{src_file}`.split(/\s+/).join("\t")
15
+ puts "Differ: #{src_ls} \n#{dest_file}"
16
+ end
17
+ end
data/bin/hdp-wc ADDED
@@ -0,0 +1,67 @@
1
+ #!/usr/bin/env ruby
2
+ require 'wukong'
3
+ NEWLINE_LENGTH = $/.length # KLUDGE
4
+
5
+ #
6
+ #
7
+ #
8
+ # !! The +words+ count comes out higher than that of +wc+ -- don't know
9
+ # why. (It's close: on a 10GB, 1M-line dataset it showed 367833839 vs. 367713271)
10
+ #
11
+ class WcMapper < Wukong::Streamer::LineStreamer
12
+ attr_accessor :lines, :fields, :words, :chars, :bytes
13
+
14
+ def before_stream
15
+ self.lines, self.fields, self.words, self.chars, self.bytes = [0,0,0,0,0]
16
+ end
17
+
18
+ def process line
19
+ return unless line
20
+ self.lines += 1
21
+ self.fields += 1 + line.count("\t")
22
+ self.words += 1 + line.strip.scan(/\s+/).length unless line.blank?
23
+ self.chars += line.chars.to_a.length + NEWLINE_LENGTH
24
+ self.bytes += line.bytesize + NEWLINE_LENGTH
25
+ $stderr.puts line if (line.chars.to_a.length != line.bytesize)
26
+ end
27
+
28
+ def after_stream
29
+ emit [lines, fields, words, chars, bytes]
30
+ end
31
+ end
32
+
33
+ #
34
+ #
35
+ class WcReducer < Wukong::Streamer::Base
36
+ attr_accessor :lines, :fields, :words, :chars, :bytes
37
+
38
+ def before_stream
39
+ self.lines, self.fields, self.words, self.chars, self.bytes = [0,0,0,0,0]
40
+ end
41
+
42
+ def process m_lines, m_fields, m_words, m_chars, m_bytes
43
+ self.lines += m_lines.to_i
44
+ self.fields += m_fields.to_i
45
+ self.words += m_words.to_i
46
+ self.chars += m_chars.to_i
47
+ self.bytes += m_bytes.to_i
48
+ end
49
+
50
+ def after_stream
51
+ emit [lines, fields, words, chars, bytes]
52
+ end
53
+ end
54
+
55
+ Wukong::Script.new(WcMapper, WcReducer, :reduce_tasks => 1).run
56
+
57
+ # class FooScript < Wukong::Script
58
+ # def map_command
59
+ # '/usr/bin/wc'
60
+ # end
61
+ # def reduce_command
62
+ # '/bin/cat'
63
+ # end
64
+ # end
65
+ # FooScript.new(nil, nil, :reduce_tasks => 1).run
66
+ #
67
+ # ruby -ne 'wc_v = `echo "#{$_.chomp}" | wc`; gr_v=($_.strip.empty? ? 0 : $_.strip.scan(/\s+/).length + 1 ) ; puts [wc_v.chomp, " ", gr_v, $_.chomp].join("\t")'
data/bin/wu-hadoop ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'wukong-hadoop'
4
+ settings = Wukong::Hadoop::Configuration
5
+ require 'wukong/boot'
6
+ Wukong.boot!(settings)
7
+
8
+ settings[:command_prefix] = 'bundle exec' if Wukong.in_deploy_pack?
9
+
10
+ if settings.rest.empty?
11
+ settings.dump_help
12
+ exit(1)
13
+ end
14
+ Wukong::Hadoop::Driver.run(settings, *settings.rest)
@@ -0,0 +1,17 @@
1
+ Wukong.processor(:counter, Wukong::Processor::Accumulator) do
2
+
3
+ attr_accessor :count
4
+
5
+ def start record
6
+ self.count = 0
7
+ end
8
+
9
+ def accumulate record
10
+ self.count += 1
11
+ end
12
+
13
+ def finalize
14
+ yield [key, count].join("\t")
15
+ end
16
+
17
+ end
@@ -0,0 +1,28 @@
1
+ Wukong.processor(:mapper) do
2
+
3
+ field :min_length, Integer, :default => 1
4
+ field :max_length, Integer, :default => 256
5
+ field :split_on, Regexp, :default => /\s+/
6
+ field :remove, Regexp, :default => /[^a-zA-Z0-9\']+/
7
+ field :fold_case, :boolean, :default => false
8
+
9
+ def process string
10
+ tokenize(string).each do |token|
11
+ yield token if acceptable?(token)
12
+ end
13
+ end
14
+
15
+ private
16
+
17
+ def tokenize string
18
+ string.split(split_on).map do |token|
19
+ stripped = token.gsub(remove, '')
20
+ fold_case ? stripped.downcase : stripped
21
+ end
22
+ end
23
+
24
+ def acceptable? token
25
+ (min_length..max_length).include?(token.length)
26
+ end
27
+
28
+ end
@@ -0,0 +1,4 @@
1
+ load File.join(File.dirname(__FILE__), 'tokenizer.rb')
2
+ load File.join(File.dirname(__FILE__), 'counter.rb')
3
+
4
+
@@ -0,0 +1,14 @@
1
+ Shall I compare thee to a summer's day?
2
+ Thou art more lovely and more temperate:
3
+ Rough winds do shake the darling buds of May,
4
+ And summer's lease hath all too short a date:
5
+ Sometime too hot the eye of heaven shines,
6
+ And often is his gold complexion dimm'd;
7
+ And every fair from fair sometime declines,
8
+ By chance or nature's changing course untrimm'd;
9
+ But thy eternal summer shall not fade
10
+ Nor lose possession of that fair thou owest;
11
+ Nor shall Death brag thou wander'st in his shade,
12
+ When in eternal lines to time thou growest:
13
+ So long as men can breathe or eyes can see,
14
+ So long lives this and this gives life to thee.
@@ -0,0 +1,28 @@
1
+ Wukong.processor(:tokenizer) do
2
+
3
+ field :min_length, Integer, :default => 1
4
+ field :max_length, Integer, :default => 256
5
+ field :split_on, Regexp, :default => /\s+/
6
+ field :remove, Regexp, :default => /[^a-zA-Z0-9\']+/
7
+ field :fold_case, :boolean, :default => false
8
+
9
+ def process string
10
+ tokenize(string).each do |token|
11
+ yield token if acceptable?(token)
12
+ end
13
+ end
14
+
15
+ private
16
+
17
+ def tokenize string
18
+ string.split(split_on).map do |token|
19
+ stripped = token.gsub(remove, '')
20
+ fold_case ? stripped.downcase : stripped
21
+ end
22
+ end
23
+
24
+ def acceptable? token
25
+ (min_length..max_length).include?(token.length)
26
+ end
27
+
28
+ end
@@ -0,0 +1,44 @@
1
+ Wukong.processor(:mapper) do
2
+
3
+ field :min_length, Integer, :default => 1
4
+ field :max_length, Integer, :default => 256
5
+ field :split_on, Regexp, :default => /\s+/
6
+ field :remove, Regexp, :default => /[^a-zA-Z0-9\']+/
7
+ field :fold_case, :boolean, :default => false
8
+
9
+ def process string
10
+ tokenize(string).each do |token|
11
+ yield token if acceptable?(token)
12
+ end
13
+ end
14
+
15
+ private
16
+
17
+ def tokenize string
18
+ string.split(split_on).map do |token|
19
+ stripped = token.gsub(remove, '')
20
+ fold_case ? stripped.downcase : stripped
21
+ end
22
+ end
23
+
24
+ def acceptable? token
25
+ (min_length..max_length).include?(token.length)
26
+ end
27
+ end
28
+
29
+ Wukong.processor(:reducer, Wukong::Processor::Accumulator) do
30
+
31
+ attr_accessor :count
32
+
33
+ def start record
34
+ self.count = 0
35
+ end
36
+
37
+ def accumulate record
38
+ self.count += 1
39
+ end
40
+
41
+ def finalize
42
+ yield [key, count].join("\t")
43
+ end
44
+ end
@@ -0,0 +1,4 @@
1
+ Given /^a wukong script "(.*?)"$/ do |wu_file|
2
+ Pathname(wu_file).should exist
3
+ write_file(wu_file, File.read(wu_file))
4
+ end
@@ -0,0 +1 @@
1
+ require 'aruba/cucumber'
@@ -0,0 +1,113 @@
1
+ Feature: Run wu-hadoop from the command line
2
+ In order to execute hadoop streaming commands
3
+ As a user of wu-hadoop
4
+ I should be able run wu-hadoop with wukong processors
5
+
6
+ Scenario: Simple wu-hadoop command
7
+ Given a wukong script "examples/word_count.rb"
8
+ When I run `bundle exec wu-hadoop examples/word_count.rb --dry_run --input=/foo --output=/bar `
9
+ Then the output should contain:
10
+ """
11
+ /usr/lib/hadoop/bin/hadoop \
12
+ jar /usr/lib/hadoop/contrib/streaming/hadoop-*streaming*.jar \
13
+ -D mapred.job.name='word_count.rb---/foo---/bar' \
14
+ """
15
+ And the output should match:
16
+ """
17
+ -mapper '.*ruby bundle exec wu-local .*word_count.rb --run=mapper ' \\
18
+ -reducer '.*ruby bundle exec wu-local .*word_count.rb --run=reducer ' \\
19
+ """
20
+ And the output should contain:
21
+ """
22
+ -input '/foo' \
23
+ -output '/bar' \
24
+ """
25
+ And the output should match:
26
+ """
27
+ -file '.*word_count.rb' \\
28
+ -cmdenv 'BUNDLE_GEMFILE=.*wukong-hadoop/Gemfile'
29
+ """
30
+
31
+ Scenario: A wu-hadoop command without an input or output
32
+ Given a wukong script "examples/word_count.rb"
33
+ When I run `bundle exec wu-hadoop examples/word_count.rb --dry_run`
34
+ Then the output should contain:
35
+ """
36
+ Missing values for: input (Comma-separated list of input paths.), output (Output directory for the hdfs.)
37
+ """
38
+
39
+ Scenario: Specifying an alternative gemfile
40
+ Given a wukong script "examples/word_count.rb"
41
+ When I run `bundle exec wu-hadoop examples/word_count.rb --dry_run --input=/foo --output=/bar --gemfile=alt/Gemfile`
42
+ Then the output should contain:
43
+ """
44
+ -cmdenv 'BUNDLE_GEMFILE=alt/Gemfile'
45
+ """
46
+
47
+ Scenario: Skipping the reduce step
48
+ Given a file named "wukong_script.rb" with:
49
+ """
50
+ Wukong.processor(:mapper) do
51
+
52
+ end
53
+ """
54
+ When I run `bundle exec wu-hadoop wukong_script.rb --dry_run --input=/foo --output=/bar`
55
+ Then the output should contain:
56
+ """
57
+ -D mapred.reduce.tasks=0 \
58
+ """
59
+
60
+ Scenario: A processor without a mapper
61
+ Given a file named "wukong_script.rb" with:
62
+ """
63
+ Wukong.processor(:reducer) do
64
+
65
+ end
66
+ """
67
+ When I run `bundle exec wu-hadoop wukong_script.rb --dry_run --input=/foo --output=/bar`
68
+ Then the output should match:
69
+ """
70
+ No :mapper definition found in .*wukong_script.rb
71
+ """
72
+
73
+ Scenario: Translating hadoop jobconf options
74
+ Given a wukong script "examples/word_count.rb"
75
+ When I run `bundle exec wu-hadoop examples/word_count.rb --dry_run --input=/foo --output=/bar --max_tracker_failures=12`
76
+ Then the output should match:
77
+ """
78
+ -D mapred.max.tracker.failures=12 \\
79
+ """
80
+
81
+ Scenario: Passing along extra configuration options
82
+ Given a wukong script "examples/word_count.rb"
83
+ When I run `bundle exec wu-hadoop examples/word_count.rb --dry_run --input=/foo --output=/bar --foo=bar`
84
+ Then the output should match:
85
+ """
86
+ -mapper '.* --foo=bar' \\
87
+ -reducer '.* --foo=bar' \\
88
+ """
89
+
90
+ Scenario: Specifying input and output formats
91
+ Given a wukong script "examples/word_count.rb"
92
+ When I run `bundle exec wu-hadoop examples/word_count.rb --dry_run --input=/foo --output=/bar --input_format=com.foo.BarInputFormat`
93
+ Then the output should contain:
94
+ """
95
+ -inputformat 'com.foo.BarInputFormat' \
96
+ """
97
+
98
+ Scenario: Specifying additional java options
99
+ Given a wukong script "examples/word_count.rb"
100
+ When I run `bundle exec wu-hadoop examples/word_count.rb --dry_run --input=/foo --output=/bar --java_opts=-Dfoo.bar=baz,-Dother.opts=cool`
101
+ Then the output should contain:
102
+ """
103
+ -D foo.bar=baz \
104
+ -D other.opts=cool \
105
+ """
106
+
107
+ Scenario: Failed hadoop job
108
+ Given a wukong script "examples/word_count.rb"
109
+ When I run `bundle exec wu-hadoop examples/word_count.rb --input=/foo --output=/bar`
110
+ Then the output should contain:
111
+ """
112
+ Streaming command failed!
113
+ """
@@ -0,0 +1,21 @@
1
+ require 'configliere'
2
+ require 'pathname'
3
+ require 'rbconfig'
4
+ require 'wukong'
5
+
6
+ module Wukong
7
+ # Wukong-Hadoop is a plugin for Wukong that lets you develop, test,
8
+ # and run map/reduce type workflows both locally and in the context
9
+ # of a Hadoop cluster.
10
+ #
11
+ # It comes with a binary program called <tt>wu-hadoop</tt> which
12
+ # lets you execute Ruby files containing Wukong processors as well
13
+ # as built-in Wukong widgets.
14
+ module Hadoop
15
+ end
16
+ end
17
+
18
+
19
+ require 'wukong-hadoop/configuration'
20
+ require 'wukong-hadoop/driver'
21
+ require 'wukong-hadoop/extensions'
@@ -0,0 +1,133 @@
1
+ module Wukong
2
+ module Hadoop
3
+
4
+ # Configure the given settings object for use with Wukong::Hadoop.
5
+ #
6
+ # @param [Configliere::Param] settings the settings to configure
7
+ # @return [Configliere::Param] the configured settings
8
+ def self.configure settings
9
+ # Hadoop Options
10
+ settings.define :hadoop_home, wukong_hadoop: true, description: 'Path to hadoop installation. HADOOP_HOME/bin/hadoop is used to run hadoop.', env_var: 'HADOOP_HOME', default: '/usr/lib/hadoop'
11
+ settings.define :hadoop_runner, wukong_hadoop: true, description: 'Path to hadoop executable. Use this for non-standard hadoop installations.'
12
+
13
+ # Translate simplified args to their hairy hadoop equivalents
14
+ settings.define :io_sort_mb, wukong_hadoop: true, jobconf: true, description: 'io.sort.mb'
15
+ settings.define :io_sort_record_percent, wukong_hadoop: true, jobconf: true, description: 'io.sort.record.percent'
16
+ settings.define :job_name, wukong_hadoop: true, jobconf: true, description: 'mapred.job.name'
17
+ settings.define :key_field_separator, wukong_hadoop: true, jobconf: true, description: 'map.output.key.field.separator'
18
+ settings.define :map_speculative, wukong_hadoop: true, jobconf: true, description: 'mapred.map.tasks.speculative.execution'
19
+ settings.define :map_tasks, wukong_hadoop: true, jobconf: true, description: 'mapred.map.tasks'
20
+ settings.define :max_maps_per_cluster, wukong_hadoop: true, jobconf: true, description: 'mapred.max.maps.per.cluster'
21
+ settings.define :max_maps_per_node, wukong_hadoop: true, jobconf: true, description: 'mapred.max.maps.per.node'
22
+ settings.define :max_node_map_tasks, wukong_hadoop: true, jobconf: true, description: 'mapred.tasktracker.map.tasks.maximum'
23
+ settings.define :max_node_reduce_tasks, wukong_hadoop: true, jobconf: true, description: 'mapred.tasktracker.reduce.tasks.maximum'
24
+ settings.define :max_record_length, wukong_hadoop: true, jobconf: true, description: 'mapred.linerecordreader.maxlength'
25
+ settings.define :max_reduces_per_cluster, wukong_hadoop: true, jobconf: true, description: 'mapred.max.reduces.per.cluster'
26
+ settings.define :max_reduces_per_node, wukong_hadoop: true, jobconf: true, description: 'mapred.max.reduces.per.node'
27
+ settings.define :max_tracker_failures, wukong_hadoop: true, jobconf: true, description: 'mapred.max.tracker.failures'
28
+ settings.define :max_map_attempts, wukong_hadoop: true, jobconf: true, description: 'mapred.map.max.attempts'
29
+ settings.define :max_reduce_attempts, wukong_hadoop: true, jobconf: true, description: 'mapred.reduce.max.attempts'
30
+ settings.define :min_split_size, wukong_hadoop: true, jobconf: true, description: 'mapred.min.split.size'
31
+ settings.define :output_field_separator, wukong_hadoop: true, jobconf: true, description: 'stream.map.output.field.separator'
32
+ settings.define :partition_fields, wukong_hadoop: true, jobconf: true, description: 'num.key.fields.for.partition'
33
+ settings.define :reduce_tasks, wukong_hadoop: true, jobconf: true, description: 'mapred.reduce.tasks'
34
+ settings.define :respect_exit_status, wukong_hadoop: true, jobconf: true, description: 'stream.non.zero.exit.is.failure'
35
+ settings.define :reuse_jvms, wukong_hadoop: true, jobconf: true, description: 'mapred.job.reuse.jvm.num.tasks'
36
+ settings.define :sort_fields, wukong_hadoop: true, jobconf: true, description: 'stream.num.map.output.key.fields'
37
+ settings.define :timeout, wukong_hadoop: true, jobconf: true, description: 'mapred.task.timeout'
38
+ settings.define :noempty, wukong_hadoop: true, description: "Don't create zero-byte reduce files"
39
+ settings.define :split_on_xml_tag, wukong_hadoop: true, description: "Parse XML document by specifying the tag name: 'anything found between <tag> and </tag> will be treated as one record for map tasks'"
40
+ settings.define :input_format, wukong_hadoop: true, description: 'Fully qualified Java class name defining an alternative InputFormat.'
41
+ settings.define :output_format, wukong_hadoop: true, description: 'Fully qualified Java class name defining an alternative OutputFormat.'
42
+ settings.define :java_opts, wukong_hadoop: true, description: 'Additional java options to be passed to hadoop streaming.', :type => Array, :default => []
43
+
44
+ # Options given on the command-line
45
+ settings.define :mode, description: "Run in either 'hadoop' or 'local' mode", wukong_hadoop: true, :default => 'hadoop'
46
+ settings.define :map_command, description: "Shell command to run as mapper, in place of a constructed wu-local command", wukong_hadoop: true
47
+ settings.define :reduce_command, description: "Shell command to run as reducer, in place of a constructed wu-local command", wukong_hadoop: true
48
+ settings.define :sort_command, description: "Shell command to run as sorter (only in `local' mode)", wukong_hadoop: true, :default => 'sort'
49
+ settings.define :command_prefix, description: "Prefix to insert before all Wukong commands", wukong_hadoop: true
50
+ settings.define :mapper, description: "Name of processor to use as a mapper", wukong_hadoop: true
51
+ settings.define :reducer, description: "Name of processor to use as a reducer", wukong_hadoop: true
52
+ settings.define :gemfile, description: "Specify an alternative Gemfile to execute this wukong script with", wukong_hadoop: true
53
+ settings.define :dry_run, description: "Echo the command that will be run, but don't run it", wukong_hadoop: true, :type => :boolean, :default => false
54
+ settings.define :rm, description: "Recursively remove the destination directory.", wukong_hadoop: true, :type => :boolean, :default => false
55
+ settings.define :input, description: "Comma-separated list of input paths", wukong_hadoop: true
56
+ settings.define :output, description: "Output path.", wukong_hadoop: true
57
+
58
+ settings.use(:commandline)
59
+
60
+ def settings.usage()
61
+ "usage: #{File.basename($0)} PROCESSOR|FLOW [PROCESSOR|FLOW] [ --param=value | -p value | --param | -p]"
62
+ end
63
+
64
+ settings.description = <<EOF
65
+ wu-hadoop is a tool to model and launch Wukong processors as
66
+ map/reduce workflows within the Hadoop framework.
67
+
68
+ Use wu-hadoop with existing processors in `local' mode to test the
69
+ logic of your job, reading from the specified --input and printing to
70
+ STDOUT:
71
+
72
+ $ wu-hadoop examples/word_count.rb --mode=local --input=examples/sonnet_18.txt
73
+ a 2
74
+ all 1
75
+ and 2
76
+ ...
77
+
78
+ where it is assumed that your mapper is called 'mapper' and your
79
+ reducer 'reducer'. You can also cat in data:
80
+
81
+ $ cat examples/sonnet_18.txt | wu-hadoop examples/word_count.rb --mode=local
82
+
83
+ Or pass options directly:
84
+
85
+ $ wu-hadoop examples/word_count.rb --mode=local --input=examples/sonnet_18.txt --fold_case --min_length=3
86
+ all 1
87
+ and 5
88
+ art 1
89
+ brag 1
90
+ ...
91
+
92
+ Or define both processors in separate files:
93
+
94
+ $ wu-hadoop examples/tokenizer.rb examples/counter.rb --mode=local --input=examples/sonnet_18.txt
95
+
96
+ Or by name:
97
+
98
+ $ wu-hadoop examples/processors.rb --mode=local --input=examples/sonnet_18.txt --mapper=tokenizer --reducer=counter
99
+
100
+ Or just by command:
101
+
102
+ $ wu-hadoop processors.rb --mapper=tokenizer --reduce_command='uniq -c' ...
103
+ $ wu-hadoop processors.rb --map_command='cut -f3' --reducer=counter ...
104
+ $ wu-hadoop --map_command='cut -f3' --reduce_command='uniq -c' ...
105
+
106
+ If you don't specify a --reducer explicitly, and you didn't give two
107
+ separate arguments, and no processor named :reducer exists in the
108
+ environment, then we assume you are launching a map-only job and
109
+ 'mapred.tasktracker.reduce.tasks.maximum' will correspondingly be set
110
+ to 0:
111
+
112
+ $ wu-hadoop examples/tokenizer.rb --mode=local --input=examples/sonnet_18.txt
113
+ Shall
114
+ I
115
+ compare
116
+ thee
117
+ ...
118
+
119
+ You can achieve this directly with the --reduce_tasks=0 option.
120
+
121
+ Many other Hadoop options have been wrapped with similarly friendly
122
+ names below. These are ignored when running in `local' mode.
123
+
124
+ Some options (like `--sort_command') only make sense in `local' mode.
125
+ These are ignored in `hadoop' mode.
126
+ EOF
127
+ settings
128
+ end
129
+
130
+ # All Hadoop configuration for Wukong lives within this object.
131
+ Configuration = configure(Configliere::Param.new) unless defined? Configuration
132
+ end
133
+ end