wukong-hadoop 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. data/.gitignore +59 -0
  2. data/.rspec +2 -0
  3. data/Gemfile +3 -0
  4. data/README.md +339 -0
  5. data/Rakefile +13 -0
  6. data/bin/hdp-bin +44 -0
  7. data/bin/hdp-bzip +23 -0
  8. data/bin/hdp-cat +3 -0
  9. data/bin/hdp-catd +3 -0
  10. data/bin/hdp-cp +3 -0
  11. data/bin/hdp-du +86 -0
  12. data/bin/hdp-get +3 -0
  13. data/bin/hdp-kill +3 -0
  14. data/bin/hdp-kill-task +3 -0
  15. data/bin/hdp-ls +11 -0
  16. data/bin/hdp-mkdir +2 -0
  17. data/bin/hdp-mkdirp +12 -0
  18. data/bin/hdp-mv +3 -0
  19. data/bin/hdp-parts_to_keys.rb +77 -0
  20. data/bin/hdp-ps +3 -0
  21. data/bin/hdp-put +3 -0
  22. data/bin/hdp-rm +32 -0
  23. data/bin/hdp-sort +40 -0
  24. data/bin/hdp-stream +40 -0
  25. data/bin/hdp-stream-flat +22 -0
  26. data/bin/hdp-stream2 +39 -0
  27. data/bin/hdp-sync +17 -0
  28. data/bin/hdp-wc +67 -0
  29. data/bin/wu-hadoop +14 -0
  30. data/examples/counter.rb +17 -0
  31. data/examples/map_only.rb +28 -0
  32. data/examples/processors.rb +4 -0
  33. data/examples/sonnet_18.txt +14 -0
  34. data/examples/tokenizer.rb +28 -0
  35. data/examples/word_count.rb +44 -0
  36. data/features/step_definitions/wu_hadoop_steps.rb +4 -0
  37. data/features/support/env.rb +1 -0
  38. data/features/wu_hadoop.feature +113 -0
  39. data/lib/wukong-hadoop.rb +21 -0
  40. data/lib/wukong-hadoop/configuration.rb +133 -0
  41. data/lib/wukong-hadoop/driver.rb +190 -0
  42. data/lib/wukong-hadoop/driver/hadoop_invocation.rb +184 -0
  43. data/lib/wukong-hadoop/driver/inputs_and_outputs.rb +27 -0
  44. data/lib/wukong-hadoop/driver/local_invocation.rb +48 -0
  45. data/lib/wukong-hadoop/driver/map_logic.rb +104 -0
  46. data/lib/wukong-hadoop/driver/reduce_logic.rb +129 -0
  47. data/lib/wukong-hadoop/extensions.rb +2 -0
  48. data/lib/wukong-hadoop/hadoop_env_methods.rb +80 -0
  49. data/lib/wukong-hadoop/version.rb +6 -0
  50. data/spec/spec_helper.rb +21 -0
  51. data/spec/support/driver_helper.rb +15 -0
  52. data/spec/support/integration_helper.rb +39 -0
  53. data/spec/wukong-hadoop/driver_spec.rb +117 -0
  54. data/spec/wukong-hadoop/hadoop_env_methods_spec.rb +14 -0
  55. data/spec/wukong-hadoop/hadoop_mode_spec.rb +78 -0
  56. data/spec/wukong-hadoop/local_mode_spec.rb +22 -0
  57. data/spec/wukong-hadoop/wu_hadoop_spec.rb +34 -0
  58. data/wukong-hadoop.gemspec +33 -0
  59. metadata +168 -0

data/lib/wukong-hadoop/driver/map_logic.rb
@@ -0,0 +1,104 @@
+ module Wukong
+   module Hadoop
+
+     # Implements logic for figuring out the correct mapper commandline
+     # given wu-hadoop's arguments.
+     module MapLogic
+
+       # Return the actual commandline used by the mapper, whether
+       # running in local or Hadoop mode.
+       #
+       # You should be able to copy, paste, and run this command
+       # unmodified to debug the mapper.
+       #
+       # @return [String]
+       def mapper_commandline
+         return settings[:map_command] if explicit_map_command?
+         [command_prefix, 'wu-local', mapper_arg].tap do |cmd|
+           cmd << "--run=#{mapper_name}" if mapper_needs_run_arg?
+           cmd << params_to_pass
+         end.compact.map(&:to_s).reject(&:empty?).join(' ')
+       end
+
+       # Were we given an explicit map command (like 'cut -f 1') or are
+       # we to introspect and construct the command?
+       #
+       # @return [true, false]
+       def explicit_map_command?
+         settings[:map_command]
+       end
+
+       # Were we given a processor to use as our mapper explicitly by
+       # name or are we to introspect to discover the correct
+       # processor?
+       #
+       # @return [true, false]
+       def explicit_map_processor?
+         settings[:mapper]
+       end
+
+       # Were we given an explicit mapper (either as a command or as a
+       # processor) or should we introspect to find one?
+       #
+       # @return [true, false]
+       def explicit_mapper?
+         explicit_map_processor? || explicit_map_command?
+       end
+
+       # The argument that we should introspect on to turn into our
+       # mapper.
+       #
+       # @return [String]
+       def mapper_arg
+         args.first
+       end
+
+       # Does the mapper commandline need an explicit --run argument?
+       #
+       # Will not be used if the processor name is the same as the name
+       # of the script.
+       #
+       # @return [true, false]
+       def mapper_needs_run_arg?
+         return false if mapper_arg.to_s == mapper_name.to_s
+         return false if File.basename(mapper_arg.to_s, '.rb') == mapper_name.to_s
+         true
+       end
+
+       # Return the name of the processor to use as the mapper.
+       #
+       # Will raise a <tt>Wukong::Error</tt> if a given mapper is
+       # invalid or if none can be guessed.
+       #
+       # Most of the logic that examines explicit command line
+       # arguments and checks for the existence of named processors or
+       # files is here.
+       #
+       # @return [String]
+       def mapper_name
+         case
+         when explicit_mapper?
+           if processor_registered?(settings[:mapper])
+             settings[:mapper]
+           else
+             raise Error.new("No such processor: '#{settings[:mapper]}'")
+           end
+         when map_only? && processor_registered?(mapper_arg)
+           mapper_arg
+         when map_only? && file_is_processor?(mapper_arg)
+           processor_name_from_file(mapper_arg)
+         when single_job_arg? && explicit_reducer? && processor_registered?(mapper_arg)
+           mapper_arg
+         when separate_map_and_reduce_args? && processor_registered?(mapper_arg)
+           mapper_arg
+         when separate_map_and_reduce_args? && file_is_processor?(mapper_arg)
+           processor_name_from_file(mapper_arg)
+         when processor_registered?('mapper')
+           'mapper'
+         else
+           raise Error.new("Could not find a processor to use as a mapper")
+         end
+       end
+     end
+   end
+ end
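
Taken together, the methods above produce either the user's explicit --map_command or a wu-local invocation. As a rough standalone sketch (not the gem's actual code; command_prefix, params_to_pass, and the settings hash below are illustrative stand-ins), the assembly boils down to:

    # Standalone sketch of the commandline assembly above; all values are
    # illustrative stand-ins, not the gem's real settings object.
    settings       = { map_command: nil }      # set to 'cut -f 1' to bypass wu-local entirely
    command_prefix = 'bundle exec'             # mirrors the driver's --command_prefix option
    mapper_arg     = 'examples/tokenizer.rb'   # first positional argument given to wu-hadoop
    params_to_pass = '--foo=bar'               # hypothetical pass-through flag

    mapper_commandline =
      if settings[:map_command]
        settings[:map_command]
      else
        [command_prefix, 'wu-local', mapper_arg, params_to_pass]
          .compact.map(&:to_s).reject(&:empty?).join(' ')
      end

    puts mapper_commandline
    # => bundle exec wu-local examples/tokenizer.rb --foo=bar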

data/lib/wukong-hadoop/driver/reduce_logic.rb
@@ -0,0 +1,129 @@
+ module Wukong
+   module Hadoop
+
+     # Implements logic for figuring out the correct reducer
+     # commandline given wu-hadoop's arguments and whether or not to
+     # run a map-only (no-reduce) job.
+     module ReduceLogic
+
+       # Return the actual commandline used by the reducer, whether
+       # running in local or Hadoop mode.
+       #
+       # You should be able to copy, paste, and run this command
+       # unmodified to debug the reducer.
+       #
+       # @return [String]
+       def reducer_commandline
+         return '' unless reduce?
+         return settings[:reduce_command] if explicit_reduce_command?
+         [command_prefix, 'wu-local', reducer_arg].tap do |cmd|
+           cmd << "--run=#{reducer_name}" if reducer_needs_run_arg?
+           cmd << params_to_pass
+         end.compact.map(&:to_s).reject(&:empty?).join(' ')
+       end
+
+       # Were we given an explicit reduce command (like 'uniq -c') or
+       # are we to introspect and construct the command?
+       #
+       # @return [true, false]
+       def explicit_reduce_command?
+         settings[:reduce_command]
+       end
+
+       # Were we given a processor to use as our reducer explicitly by
+       # name or are we to introspect to discover the correct
+       # processor?
+       #
+       # @return [true, false]
+       def explicit_reduce_processor?
+         settings[:reducer]
+       end
+
+       # Were we given an explicit reducer (either as a command or as a
+       # processor) or should we introspect to find one?
+       #
+       # @return [true, false]
+       def explicit_reducer?
+         explicit_reduce_processor? || explicit_reduce_command?
+       end
+
+       # The argument that we should introspect on to turn into our
+       # reducer.
+       #
+       # @return [String]
+       def reducer_arg
+         args.last
+       end
+
+       # Should we perform a reduce or is this a map-only job?
+       #
+       # We will definitely reduce if
+       #
+       # - given an explicit <tt>--reduce_command</tt>
+       # - we discovered a reducer
+       #
+       # We will not reduce if:
+       #
+       # - <tt>--reduce_tasks</tt> was explicitly set to 0
+       #
+       # @return [true, false]
+       def reduce?
+         return false if settings[:reduce_tasks] && settings[:reduce_tasks].to_i == 0
+         return true if settings[:reduce_command]
+         return true if reducer_name
+         false
+       end
+
+       # Is this a map-only job?
+       #
+       # @see #reduce?
+       #
+       # @return [true, false]
+       def map_only?
+         (! reduce?)
+       end
+
+       # Does the reducer commandline need an explicit --run argument?
+       #
+       # Will not be used if the processor name is the same as the name
+       # of the script.
+       #
+       # @return [true, false]
+       def reducer_needs_run_arg?
+         return false if reducer_arg.to_s == reducer_name.to_s
+         return false if File.basename(reducer_arg.to_s, '.rb') == reducer_name
+         true
+       end
+
+       # Return the name of the processor to use as the reducer.
+       #
+       # Will raise a <tt>Wukong::Error</tt> if a given reducer is
+       # invalid. Will return nil if no reducer can be guessed.
+       #
+       # Most of the logic that examines explicit command line
+       # arguments and checks for the existence of named processors or
+       # files is here.
+       #
+       # @return [String]
+       def reducer_name
+         case
+         when explicit_reducer?
+           if processor_registered?(settings[:reducer])
+             settings[:reducer]
+           else
+             raise Error.new("No such processor: '#{settings[:reducer]}'")
+           end
+         when single_job_arg? && explicit_mapper? && processor_registered?(reducer_arg)
+           reducer_arg
+         when separate_map_and_reduce_args? && processor_registered?(reducer_arg)
+           reducer_arg
+         when separate_map_and_reduce_args? && file_is_processor?(reducer_arg)
+           processor_name_from_file(reducer_arg)
+         when processor_registered?('reducer')
+           'reducer'
+         end
+       end
+
+     end
+   end
+ end
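
The reduce-or-not decision documented above has a clear precedence. A minimal standalone sketch (an assumption-level illustration, not the gem's code, with the settings hash and discovered reducer passed in explicitly) shows it:

    # Sketch of the reduce? precedence described above: an explicit
    # --reduce_tasks=0 wins, then an explicit --reduce_command, then
    # whether any reducer processor could be discovered.
    def reduce_job?(settings, discovered_reducer)
      return false if settings[:reduce_tasks] && settings[:reduce_tasks].to_i == 0
      return true  if settings[:reduce_command]
      return true  if discovered_reducer
      false
    end

    puts reduce_job?({ reduce_tasks: 0, reduce_command: 'uniq -c' }, nil)  # => false
    puts reduce_job?({ reduce_command: 'uniq -c' }, nil)                   # => true
    puts reduce_job?({}, 'counter')                                        # => true
    puts reduce_job?({}, nil)                                              # => false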

data/lib/wukong-hadoop/extensions.rb
@@ -0,0 +1,2 @@
+ require_relative("hadoop_env_methods")
+

data/lib/wukong-hadoop/hadoop_env_methods.rb
@@ -0,0 +1,80 @@
+ module Wukong
+   module Hadoop
+
+     # Hadoop streaming exposes several environment variables to
+     # scripts it executes. This module contains methods that make
+     # these variables easily accessed from within a processor.
+     #
+     # Since these environment variables are ultimately set by Hadoop's
+     # streaming jar when executing inside Hadoop, you'll have to set
+     # them manually when testing locally.
+     #
+     # Via @pskomoroch via @tlipcon:
+     #
+     #   "there is a little known Hadoop Streaming trick buried in this Python
+     #   script. You will notice that the date is not actually in the raw log
+     #   data itself, but is part of the filename. It turns out that Hadoop makes
+     #   job parameters you would fetch in Java with something like
+     #   job.get("mapred.input.file") available as environment variables for
+     #   streaming jobs, with periods replaced with underscores:
+     #
+     #     filepath = os.environ["map_input_file"]
+     #     filename = os.path.split(filepath)[-1]
+     module EnvMethods
+
+       # Fetch a parameter set by Hadoop streaming in the environment
+       # of the currently executing process.
+       #
+       # @param [String] name the '.' separated parameter name to fetch
+       # @return [String] the value from the process' environment
+       def hadoop_streaming_parameter name
+         ENV[name.gsub('.', '_')]
+       end
+
+       # Path of the (data) file currently being processed.
+       #
+       # @return [String]
+       def input_file
+         ENV['map_input_file']
+       end
+
+       # Directory of the (data) file currently being processed.
+       #
+       # @return [String]
+       def input_dir
+         ENV['mapred_input_dir']
+       end
+
+       # Offset of the chunk currently being processed within the current input file.
+       #
+       # @return [String]
+       def map_input_start_offset
+         ENV['map_input_start']
+       end
+
+       # Length of the chunk currently being processed within the current input file.
+       #
+       # @return [String]
+       def map_input_length
+         ENV['map_input_length']
+       end
+
+       # ID of the current map/reduce attempt.
+       #
+       # @return [String]
+       def attempt_id
+         ENV['mapred_task_id']
+       end
+
+       # ID of the current map/reduce task.
+       #
+       # @return [String]
+       def curr_task_id
+         ENV['mapred_tip_id']
+       end
+
+     end
+   end
+
+   Processor.class_eval{ include Hadoop::EnvMethods }
+ end
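
Because EnvMethods is mixed into every Processor (last line above), these helpers can be called directly inside a processor; outside Hadoop you have to fake the environment yourself, as the comment notes. A hedged usage sketch follows (the processor name, paths, and the assumption that the wukong processor DSL is loaded by require 'wukong-hadoop' are all illustrative):

    # When not running under Hadoop streaming, set the variables by hand.
    ENV['map_input_file']   = '/data/logs/2012-11-01.log'   # made-up path
    ENV['mapred_input_dir'] = '/data/logs'

    require 'wukong-hadoop'

    Wukong.processor(:log_dater) do
      def process(line)
        # The date lives in the filename, not the record -- the trick quoted above.
        date = File.basename(input_file, '.log')
        yield [date, line].join("\t")
      end
    end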

data/lib/wukong-hadoop/version.rb
@@ -0,0 +1,6 @@
+ module Wukong
+   module Hadoop
+     # The current version of Wukong-Hadoop.
+     VERSION = '0.0.1'
+   end
+ end

data/spec/spec_helper.rb
@@ -0,0 +1,21 @@
+ require 'wukong-hadoop'
+ require_relative('support/integration_helper')
+ require_relative('support/driver_helper')
+ require 'wukong/spec_helpers'
+
+ RSpec.configure do |config|
+
+   config.before(:each) do
+     @orig_reg = Wukong.registry.show
+   end
+
+   config.after(:each) do
+     Wukong.registry.clear!
+     Wukong.registry.merge!(@orig_reg)
+   end
+
+   include Wukong::SpecHelpers
+   include Wukong::Hadoop::IntegrationHelper
+   include Wukong::Hadoop::DriverHelper
+ end
+

data/spec/support/driver_helper.rb
@@ -0,0 +1,15 @@
+ module Wukong
+   module Hadoop
+     module DriverHelper
+
+       def driver *args
+         params = ::Wukong::Hadoop.configure(Configliere::Param.new)
+         params.resolve!
+         params.merge!(args.pop) if args.last.is_a?(Hash)
+         Wukong::Hadoop::Driver.new(params, *args)
+       end
+
+     end
+   end
+ end
+
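
The helper above is also a reasonable template for building a Wukong::Hadoop::Driver by hand, for example in an irb session. A hedged sketch (same calls as the helper; the override hash and example path are illustrative and assume the examples directory is reachable from the working directory):

    require 'wukong-hadoop'

    # Same recipe as the spec helper above: default settings, resolve, then
    # overrides; positional args are the map/reduce scripts or widget names.
    params = ::Wukong::Hadoop.configure(Configliere::Param.new)
    params.resolve!
    params.merge!(reduce_tasks: 0)

    driver = Wukong::Hadoop::Driver.new(params, 'examples/tokenizer.rb')
    puts driver.mapper_commandline   # e.g. "wu-local .../examples/tokenizer.rb"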

data/spec/support/integration_helper.rb
@@ -0,0 +1,39 @@
+ module Wukong
+   module Hadoop
+     module IntegrationHelper
+
+       def root
+         @root ||= Pathname.new(File.expand_path('../../..', __FILE__))
+       end
+
+       def lib_dir
+         root.join('lib')
+       end
+
+       def bin_dir
+         root.join('bin')
+       end
+
+       def examples_dir
+         root.join('examples')
+       end
+
+       def integration_env
+         {
+           "PATH" => [bin_dir.to_s, ENV["PATH"]].compact.join(':'),
+           "RUBYLIB" => [lib_dir.to_s, ENV["RUBYLIB"]].compact.join(':')
+         }
+       end
+
+       def integration_cwd
+         root.to_s
+       end
+
+       def example_script *args
+         examples_dir.join(*args)
+       end
+
+     end
+   end
+ end
+
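
integration_env above prepends the gem's own bin/ and lib/ so that spawned wu-hadoop processes pick up the development tree. A hedged sketch of how such an env hash is typically consumed (the spawned command and flags are illustrative; the real integration specs are not shown in this diff):

    # Kernel#system accepts an env hash as its first argument, so an
    # integration spec can run the development copy of wu-hadoop like this.
    env = {
      "PATH"    => [File.expand_path("bin"), ENV["PATH"]].compact.join(":"),
      "RUBYLIB" => [File.expand_path("lib"), ENV["RUBYLIB"]].compact.join(":")
    }
    system(env, "wu-hadoop examples/word_count.rb --dry_run")  # illustrative invocation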

data/spec/wukong-hadoop/driver_spec.rb
@@ -0,0 +1,117 @@
+ require 'spec_helper'
+
+ describe Wukong::Hadoop::Driver do
+
+   context "processing its arguments" do
+     it "raises an error when it can't find a file" do
+       lambda { driver(example_script('processors.rb'), example_script('doesnt_exist.rb')) }.should raise_error(Wukong::Error, /No such processor or file/)
+     end
+     it "raises an error when it can't find a widget" do
+       lambda { driver('regexp', 'doesnt_exist') }.should raise_error(Wukong::Error, /No such processor or file/)
+     end
+     it "raises an error when given more than two arguments" do
+       lambda { driver('regexp', example_script('counter.rb'), 'extra') }.should raise_error(Wukong::Error, /more than two/)
+     end
+   end
+
+   context "will execute a map-only job" do
+     context "with an explicit map command" do
+       let(:subject) { driver(:map_command => 'cut -f 1') }
+       its(:reduce?) { should be_false }
+       its(:mapper_commandline) { should match /^cut -f 1$/ }
+     end
+     context "with a single widget" do
+       let(:subject) { driver('regexp') }
+       its(:reduce?) { should be_false }
+       its(:mapper_commandline) { should match /^wu-local regexp$/ }
+     end
+     context "with a single file" do
+       context "defining a processor named 'mapper'" do
+         let(:subject) { driver(example_script('map_only.rb')) }
+         its(:reduce?) { should be_false }
+         its(:mapper_commandline) { should match /^wu-local .*map_only.rb --run=mapper$/ }
+       end
+       context "defining a processor named after the file" do
+         let(:subject) { driver(example_script('tokenizer.rb')) }
+         its(:reduce?) { should be_false }
+         its(:mapper_commandline) { should match /^wu-local .*tokenizer.rb$/ }
+       end
+       context "using the given --mapper option " do
+         let(:subject) { driver(example_script('processors.rb'), :mapper => 'tokenizer') }
+         its(:reduce?) { should be_false }
+         its(:mapper_commandline) { should match /^wu-local .*processors.rb --run=tokenizer$/ }
+       end
+       context "defining a processor named 'reducer' but with --reduce_tasks=0" do
+         let(:subject) { driver(example_script('word_count.rb'), :reduce_tasks => 0) }
+         its(:reduce?) { should be_false }
+         its(:mapper_commandline) { should match /^wu-local .*word_count.rb --run=mapper$/ }
+       end
+     end
+     context "with two files but with --reduce_tasks=0" do
+       let(:subject) { driver(example_script('tokenizer.rb'), example_script('counter.rb'), :reduce_tasks => 0) }
+       its(:reduce?) { should be_false }
+       its(:mapper_commandline) { should match /^wu-local .*tokenizer.rb$/ }
+     end
+   end
+
+   context "will execute a map-reduce job" do
+     context "with explicit map and reduce commands" do
+       let(:subject) { driver(:map_command => 'cut -f 1', :reduce_command => 'uniq -c') }
+       its(:reduce?) { should be_true }
+       its(:mapper_commandline) { should == 'cut -f 1' }
+       its(:reducer_commandline) { should == 'uniq -c' }
+     end
+     context "with two widgets" do
+       let(:subject) { driver('regexp', 'count') }
+       its(:reduce?) { should be_true }
+       its(:mapper_commandline) { should match /^wu-local regexp$/ }
+       its(:reducer_commandline) { should match /^wu-local count$/ }
+     end
+     context "with a single file" do
+       context "defining processors named 'mapper' and 'reducer'" do
+         let(:subject) { driver(example_script('word_count.rb')) }
+         its(:reduce?) { should be_true }
+         its(:mapper_commandline) { should match /^wu-local .*word_count.rb --run=mapper$/ }
+         its(:reducer_commandline) { should match /^wu-local .*word_count.rb --run=reducer$/ }
+       end
+     end
+     context "with two files" do
+       let(:subject) { driver(example_script('tokenizer.rb'), example_script('counter.rb')) }
+       its(:reduce?) { should be_true }
+       its(:mapper_commandline) { should match /^wu-local .*tokenizer.rb$/ }
+       its(:reducer_commandline) { should match /^wu-local .*counter.rb$/ }
+     end
+     context "with a widget and a file" do
+       let(:subject) { driver('regexp', example_script('counter.rb')) }
+       its(:reduce?) { should be_true }
+       its(:mapper_commandline) { should match /^wu-local regexp$/ }
+       its(:reducer_commandline) { should match /^wu-local .*counter.rb$/ }
+     end
+     context "with a file and a widget" do
+       let(:subject) { driver(example_script('tokenizer.rb'), 'count') }
+       its(:reduce?) { should be_true }
+       its(:mapper_commandline) { should match /^wu-local .*tokenizer.rb$/ }
+       its(:reducer_commandline) { should match /^wu-local count$/ }
+     end
+   end
+
+   context "handling arguments" do
+     let(:subject) { driver('regexp', :clean => 'hi', :messy => 'hi "there"', :reduce_tasks => 0, :dry_run => true, :rm => true) }
+     it "passes arguments it doesn't know about to wu-local" do
+       subject.mapper_commandline.should include('--clean=hi')
+     end
+     it "correctly passes messy arguments" do
+       subject.mapper_commandline.should include('--messy=hi\\ \\"there\\"')
+     end
+     it "does not pass arguments that are internal to wukong-hadoop" do
+       subject.mapper_commandline.should_not include('--reduce_tasks', '--dry_run', '--rm')
+     end
+   end
+
+   context "given the --command_prefix option" do
+     let(:subject) { driver('regexp', 'count', :command_prefix => 'bundle exec') }
+     its(:mapper_commandline) { should match(/^bundle exec wu-local/) }
+     its(:reducer_commandline) { should match(/^bundle exec wu-local/) }
+   end
+
+ end