wukong-hadoop 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. data/.gitignore +59 -0
  2. data/.rspec +2 -0
  3. data/Gemfile +3 -0
  4. data/README.md +339 -0
  5. data/Rakefile +13 -0
  6. data/bin/hdp-bin +44 -0
  7. data/bin/hdp-bzip +23 -0
  8. data/bin/hdp-cat +3 -0
  9. data/bin/hdp-catd +3 -0
  10. data/bin/hdp-cp +3 -0
  11. data/bin/hdp-du +86 -0
  12. data/bin/hdp-get +3 -0
  13. data/bin/hdp-kill +3 -0
  14. data/bin/hdp-kill-task +3 -0
  15. data/bin/hdp-ls +11 -0
  16. data/bin/hdp-mkdir +2 -0
  17. data/bin/hdp-mkdirp +12 -0
  18. data/bin/hdp-mv +3 -0
  19. data/bin/hdp-parts_to_keys.rb +77 -0
  20. data/bin/hdp-ps +3 -0
  21. data/bin/hdp-put +3 -0
  22. data/bin/hdp-rm +32 -0
  23. data/bin/hdp-sort +40 -0
  24. data/bin/hdp-stream +40 -0
  25. data/bin/hdp-stream-flat +22 -0
  26. data/bin/hdp-stream2 +39 -0
  27. data/bin/hdp-sync +17 -0
  28. data/bin/hdp-wc +67 -0
  29. data/bin/wu-hadoop +14 -0
  30. data/examples/counter.rb +17 -0
  31. data/examples/map_only.rb +28 -0
  32. data/examples/processors.rb +4 -0
  33. data/examples/sonnet_18.txt +14 -0
  34. data/examples/tokenizer.rb +28 -0
  35. data/examples/word_count.rb +44 -0
  36. data/features/step_definitions/wu_hadoop_steps.rb +4 -0
  37. data/features/support/env.rb +1 -0
  38. data/features/wu_hadoop.feature +113 -0
  39. data/lib/wukong-hadoop.rb +21 -0
  40. data/lib/wukong-hadoop/configuration.rb +133 -0
  41. data/lib/wukong-hadoop/driver.rb +190 -0
  42. data/lib/wukong-hadoop/driver/hadoop_invocation.rb +184 -0
  43. data/lib/wukong-hadoop/driver/inputs_and_outputs.rb +27 -0
  44. data/lib/wukong-hadoop/driver/local_invocation.rb +48 -0
  45. data/lib/wukong-hadoop/driver/map_logic.rb +104 -0
  46. data/lib/wukong-hadoop/driver/reduce_logic.rb +129 -0
  47. data/lib/wukong-hadoop/extensions.rb +2 -0
  48. data/lib/wukong-hadoop/hadoop_env_methods.rb +80 -0
  49. data/lib/wukong-hadoop/version.rb +6 -0
  50. data/spec/spec_helper.rb +21 -0
  51. data/spec/support/driver_helper.rb +15 -0
  52. data/spec/support/integration_helper.rb +39 -0
  53. data/spec/wukong-hadoop/driver_spec.rb +117 -0
  54. data/spec/wukong-hadoop/hadoop_env_methods_spec.rb +14 -0
  55. data/spec/wukong-hadoop/hadoop_mode_spec.rb +78 -0
  56. data/spec/wukong-hadoop/local_mode_spec.rb +22 -0
  57. data/spec/wukong-hadoop/wu_hadoop_spec.rb +34 -0
  58. data/wukong-hadoop.gemspec +33 -0
  59. metadata +168 -0
@@ -0,0 +1,104 @@
1
module Wukong
  module Hadoop

    # Implements logic for figuring out the correct mapper commandline
    # given wu-hadoop's arguments.
    #
    # This is a mixin.  The methods below call +settings+, +args+,
    # +command_prefix+, +params_to_pass+, +processor_registered?+,
    # +file_is_processor?+, +processor_name_from_file+, +map_only?+,
    # +single_job_arg?+, +separate_map_and_reduce_args?+ and
    # +explicit_reducer?+, all of which must be supplied by the
    # including object.
    module MapLogic

      # Return the actual commandline used by the mapper, whether
      # running in local or Hadoop mode.
      #
      # You should be able to copy, paste, and run this command
      # unmodified to debug the mapper.
      #
      # An explicit <tt>--map_command</tt> is returned verbatim;
      # otherwise the command is assembled from the command prefix,
      # <tt>wu-local</tt>, the mapper argument, an optional
      # <tt>--run</tt> flag and the pass-through params, skipping any
      # piece that is nil or empty.
      #
      # @return [String]
      def mapper_commandline
        return settings[:map_command] if explicit_map_command?
        [command_prefix, 'wu-local', mapper_arg].tap do |cmd|
          cmd << "--run=#{mapper_name}" if mapper_needs_run_arg?
          cmd << params_to_pass
        end.compact.map(&:to_s).reject(&:empty?).join(' ')
      end

      # Were we given an explicit map command (like 'cut -f 1') or are
      # we to introspect and construct the command?
      #
      # @return [true, false]
      def explicit_map_command?
        # Coerce so callers get a real boolean, as documented, rather
        # than the command string itself.
        !!settings[:map_command]
      end

      # Were we given a processor to use as our mapper explicitly by
      # name or are we to introspect to discover the correct
      # processor?
      #
      # @return [true, false]
      def explicit_map_processor?
        !!settings[:mapper]
      end

      # Were we given an explicit mapper (either as a command or as a
      # processor) or should we introspect to find one?
      #
      # @return [true, false]
      def explicit_mapper?
        explicit_map_processor? || explicit_map_command?
      end

      # The argument that we should introspect on to turn into our
      # mapper.  This is the first positional argument, or nil when
      # no arguments were given.
      #
      # @return [String, nil]
      def mapper_arg
        args.first
      end

      # Does the mapper commandline need an explicit --run argument?
      #
      # Will not be used if the processor name is the same as the
      # mapper argument or as the basename of the script (with any
      # <tt>.rb</tt> extension removed).
      #
      # @return [true, false]
      def mapper_needs_run_arg?
        # Resolve the name once; the lookup may inspect files.
        name = mapper_name.to_s
        return false if mapper_arg.to_s == name
        File.basename(mapper_arg.to_s, '.rb') != name
      end

      # Return the name of the processor to use as the mapper.
      #
      # Will raise a <tt>Wukong::Error</tt> if a given mapper is
      # invalid or if none can be guessed.
      #
      # Most of the logic that examines explicit command line
      # arguments and checks for the existence of named processors or
      # files is here.  The branches are tried in order and the first
      # match wins; a processor literally named 'mapper' is the last
      # resort.
      #
      # NOTE(review): the first branch is also entered when only
      # <tt>--map_command</tt> was given, in which case
      # <tt>settings[:mapper]</tt> is nil.  #mapper_commandline never
      # gets here in that situation because it returns the command
      # first; other callers should confirm they do the same.
      #
      # @return [String]
      # @raise [Wukong::Error] if the explicit mapper is not registered
      #   or no mapper can be discovered
      def mapper_name
        case
        when explicit_mapper?
          if processor_registered?(settings[:mapper])
            settings[:mapper]
          else
            raise Error.new("No such processor: '#{settings[:mapper]}'")
          end
        when map_only? && processor_registered?(mapper_arg)
          mapper_arg
        when map_only? && file_is_processor?(mapper_arg)
          processor_name_from_file(mapper_arg)
        when single_job_arg? && explicit_reducer? && processor_registered?(mapper_arg)
          mapper_arg
        when separate_map_and_reduce_args? && processor_registered?(mapper_arg)
          mapper_arg
        when separate_map_and_reduce_args? && file_is_processor?(mapper_arg)
          processor_name_from_file(mapper_arg)
        when processor_registered?('mapper')
          'mapper'
        else
          raise Error.new("Could not find a processor to use as a mapper")
        end
      end
    end
  end
end
@@ -0,0 +1,129 @@
1
module Wukong
  module Hadoop

    # Implements logic for figuring out the correct reducer
    # commandline given wu-hadoop's arguments and whether or not to
    # run a map-only (no-reduce) job.
    #
    # This is a mixin.  The methods below call +settings+, +args+,
    # +command_prefix+, +params_to_pass+, +processor_registered?+,
    # +file_is_processor?+, +processor_name_from_file+,
    # +single_job_arg?+, +separate_map_and_reduce_args?+ and
    # +explicit_mapper?+, all of which must be supplied by the
    # including object.
    module ReduceLogic

      # Return the actual commandline used by the reducer, whether
      # running in local or Hadoop mode.
      #
      # You should be able to copy, paste, and run this command
      # unmodified to debug the reducer.
      #
      # Returns the empty string for a map-only job and returns an
      # explicit <tt>--reduce_command</tt> verbatim.
      #
      # @return [String]
      def reducer_commandline
        return '' unless reduce?
        return settings[:reduce_command] if explicit_reduce_command?
        [command_prefix, 'wu-local', reducer_arg].tap do |cmd|
          cmd << "--run=#{reducer_name}" if reducer_needs_run_arg?
          cmd << params_to_pass
        end.compact.map(&:to_s).reject(&:empty?).join(' ')
      end

      # Were we given an explicit reduce command (like 'uniq -c') or
      # are we to introspect and construct the command?
      #
      # @return [true, false]
      def explicit_reduce_command?
        # Coerce so callers get a real boolean, as documented, rather
        # than the command string itself.
        !!settings[:reduce_command]
      end

      # Were we given a processor to use as our reducer explicitly by
      # name or are we to introspect to discover the correct
      # processor?
      #
      # @return [true, false]
      def explicit_reduce_processor?
        !!settings[:reducer]
      end

      # Were we given an explicit reducer (either as a command or as a
      # processor) or should we introspect to find one?
      #
      # @return [true, false]
      def explicit_reducer?
        explicit_reduce_processor? || explicit_reduce_command?
      end

      # The argument that we should introspect on to turn into our
      # reducer.  This is the last positional argument, or nil when
      # no arguments were given.
      #
      # @return [String, nil]
      def reducer_arg
        args.last
      end

      # Should we perform a reduce or is this a map-only job?
      #
      # We will definitely reduce if
      #
      # - given an explicit <tt>--reduce_command</tt>
      # - we discovered a reducer
      #
      # We will not reduce if:
      #
      # - <tt>--reduce_tasks</tt> was explicitly set to 0
      #
      # The <tt>--reduce_tasks</tt> check comes first, so it overrides
      # both an explicit command and a discovered reducer.
      #
      # @return [true, false]
      def reduce?
        return false if settings[:reduce_tasks] && settings[:reduce_tasks].to_i == 0
        return true  if explicit_reduce_command?
        !!reducer_name
      end

      # Is this a map-only job?
      #
      # @see #reduce?
      #
      # @return [true, false]
      def map_only?
        !reduce?
      end

      # Does the reducer commandline need an explicit --run argument?
      #
      # Will not be used if the processor name is the same as the
      # reducer argument or as the basename of the script (with any
      # <tt>.rb</tt> extension removed).
      #
      # @return [true, false]
      def reducer_needs_run_arg?
        # Normalise to a String once so that a Symbol processor name
        # compares equal in both checks below.
        name = reducer_name.to_s
        return false if reducer_arg.to_s == name
        File.basename(reducer_arg.to_s, '.rb') != name
      end

      # Return the name of the processor to use as the reducer.
      #
      # Will raise a <tt>Wukong::Error</tt> if a given reducer is
      # invalid.  Will return nil if no reducer can be guessed.
      #
      # Most of the logic that examines explicit command line
      # arguments and checks for the existence of named processors or
      # files is here.  The branches are tried in order and the first
      # match wins; a processor literally named 'reducer' is the last
      # resort.
      #
      # @return [String, nil]
      # @raise [Wukong::Error] if the explicit reducer is not registered
      def reducer_name
        case
        when explicit_reducer?
          if processor_registered?(settings[:reducer])
            settings[:reducer]
          else
            raise Error.new("No such processor: '#{settings[:reducer]}'")
          end
        when single_job_arg? && explicit_mapper? && processor_registered?(reducer_arg)
          reducer_arg
        when separate_map_and_reduce_args? && processor_registered?(reducer_arg)
          reducer_arg
        when separate_map_and_reduce_args? && file_is_processor?(reducer_arg)
          processor_name_from_file(reducer_arg)
        when processor_registered?('reducer')
          'reducer'
        end
      end

    end
  end
end
@@ -0,0 +1,2 @@
1
+ require_relative("hadoop_env_methods")
2
+
@@ -0,0 +1,80 @@
1
+ module Wukong
2
+ module Hadoop
3
+
4
# Hadoop streaming exposes several environment variables to
# scripts it executes.  This module contains methods that make
# these variables easily accessed from within a processor.
#
# Since these environment variables are ultimately set by Hadoop's
# streaming jar when executing inside Hadoop, you'll have to set
# them manually when testing locally.
#
# Via @pskomoroch via @tlipcon:
#
#   "there is a little known Hadoop Streaming trick buried in this Python
#   script. You will notice that the date is not actually in the raw log
#   data itself, but is part of the filename. It turns out that Hadoop makes
#   job parameters you would fetch in Java with something like
#   job.get("mapred.input.file") available as environment variables for
#   streaming jobs, with periods replaced with underscores:
#
#     filepath = os.environ["map_input_file"]
#     filename = os.path.split(filepath)[-1]"
#
# Every method here is a plain lookup in ENV and returns nil when the
# variable is not set.
module EnvMethods

  # Fetch a parameter set by Hadoop streaming in the environment
  # of the currently executing process.
  #
  # The name is converted to a String first, so a Symbol such as
  # <tt>:'map.input.file'</tt> is accepted as well.
  #
  # @param [String, Symbol] name the '.' separated parameter name to fetch
  # @return [String, nil] the value from the process' environment
  def hadoop_streaming_parameter(name)
    ENV[name.to_s.tr('.', '_')]
  end

  # Path of the (data) file currently being processed.
  #
  # @return [String, nil]
  def input_file
    ENV['map_input_file']
  end

  # Directory of the (data) file currently being processed.
  #
  # @return [String, nil]
  def input_dir
    ENV['mapred_input_dir']
  end

  # Offset of the chunk currently being processed within the current input file.
  #
  # The value is returned as found in the environment, i.e. as a
  # String, not an Integer.
  #
  # @return [String, nil]
  def map_input_start_offset
    ENV['map_input_start']
  end

  # Length of the chunk currently being processed within the current input file.
  #
  # The value is returned as found in the environment, i.e. as a
  # String, not an Integer.
  #
  # @return [String, nil]
  def map_input_length
    ENV['map_input_length']
  end

  # ID of the current map/reduce attempt.
  #
  # @return [String, nil]
  def attempt_id
    ENV['mapred_task_id']
  end

  # ID of the current map/reduce task.
  #
  # @return [String, nil]
  def curr_task_id
    ENV['mapred_tip_id']
  end

end
77
+ end
78
+
79
# Mix the Hadoop environment accessors into Processor so that they can
# be called from within a processor.
Processor.send(:include, Hadoop::EnvMethods)
80
+ end
@@ -0,0 +1,6 @@
1
module Wukong
  module Hadoop
    # The current version of Wukong-Hadoop.
    #
    # Frozen so that the shared constant cannot be mutated in place.
    VERSION = '0.0.1'.freeze
  end
end
@@ -0,0 +1,21 @@
1
+ require 'wukong-hadoop'
2
+ require_relative('support/integration_helper')
3
+ require_relative('support/driver_helper')
4
+ require 'wukong/spec_helpers'
5
+
6
RSpec.configure do |rspec|

  # Mix the spec helpers into the top-level object, exactly as a bare
  # `include` inside this block does.
  [
    Wukong::SpecHelpers,
    Wukong::Hadoop::IntegrationHelper,
    Wukong::Hadoop::DriverHelper
  ].each { |helper| include helper }

  # Remember what the registry reports before each example...
  rspec.before(:each) { @orig_reg = Wukong.registry.show }

  # ...then empty it and merge the remembered contents back in
  # afterwards.
  # NOTE(review): presumably this stops processors registered by one
  # example from leaking into the next; confirm that `show` returns a
  # copy that `clear!` does not empty.
  rspec.after(:each) do
    Wukong.registry.clear!
    Wukong.registry.merge!(@orig_reg)
  end

end
21
+
@@ -0,0 +1,15 @@
1
module Wukong
  module Hadoop

    # Spec helper for building a Wukong::Hadoop::Driver.
    module DriverHelper

      # Build a driver from positional arguments and an optional
      # trailing Hash of settings.
      #
      # A fresh Configliere::Param is passed through
      # Wukong::Hadoop.configure and resolved; a trailing Hash, when
      # present, is then merged over it.  The remaining arguments are
      # handed to the driver unchanged.
      #
      # @param [Array] args positional arguments, optionally ending in
      #   a Hash of settings
      # @return [Wukong::Hadoop::Driver]
      def driver(*args)
        overrides = args.last.is_a?(Hash) ? args.pop : nil
        settings  = ::Wukong::Hadoop.configure(Configliere::Param.new)
        settings.resolve!
        settings.merge!(overrides) if overrides
        Wukong::Hadoop::Driver.new(settings, *args)
      end

    end
  end
end
15
+
@@ -0,0 +1,39 @@
1
require 'pathname'

module Wukong
  module Hadoop

    # Spec helper exposing the project's directories and an
    # environment in which the project's own executables and
    # libraries are found first.
    module IntegrationHelper

      # Project root: three path components up from this file.
      #
      # @return [Pathname]
      def root
        @root ||= Pathname.new(File.expand_path('../../..', __FILE__))
      end

      # @return [Pathname] the project's <tt>lib</tt> directory
      def lib_dir
        root.join('lib')
      end

      # @return [Pathname] the project's <tt>bin</tt> directory
      def bin_dir
        root.join('bin')
      end

      # @return [Pathname] the project's <tt>examples</tt> directory
      def examples_dir
        root.join('examples')
      end

      # Environment variables for integration runs: the project's
      # <tt>bin</tt> and <tt>lib</tt> directories are prepended to the
      # current PATH and RUBYLIB.  A variable that is currently unset
      # contributes nothing, so no dangling separator is produced.
      #
      # @return [Hash{String => String}]
      def integration_env
        {
          "PATH"    => [bin_dir.to_s, ENV["PATH"]].compact.join(File::PATH_SEPARATOR),
          "RUBYLIB" => [lib_dir.to_s, ENV["RUBYLIB"]].compact.join(File::PATH_SEPARATOR)
        }
      end

      # Working directory for integration runs: the project root.
      #
      # @return [String]
      def integration_cwd
        root.to_s
      end

      # Path to a file beneath the examples directory.
      #
      # @param [Array<String>] args path components relative to
      #   <tt>examples</tt>
      # @return [Pathname]
      def example_script *args
        examples_dir.join(*args)
      end

    end
  end
end
39
+
@@ -0,0 +1,117 @@
1
+ require 'spec_helper'
2
+
3
# Exercises how the driver turns its positional arguments and settings
# into mapper/reducer commandlines and decides between map-only and
# map-reduce jobs.
#
# Regexp matchers are parenthesised (no "ambiguous first argument"
# warning) and escape the dot in "*.rb" so it matches only a literal
# dot.
describe Wukong::Hadoop::Driver do

  context "processing its arguments" do
    it "raises an error when it can't find a file" do
      lambda { driver(example_script('processors.rb'), example_script('doesnt_exist.rb')) }.should raise_error(Wukong::Error, /No such processor or file/)
    end
    it "raises an error when it can't find a widget" do
      lambda { driver('regexp', 'doesnt_exist') }.should raise_error(Wukong::Error, /No such processor or file/)
    end
    it "raises an error when given more than two arguments" do
      lambda { driver('regexp', example_script('counter.rb'), 'extra') }.should raise_error(Wukong::Error, /more than two/)
    end
  end

  context "will execute a map-only job" do
    context "with an explicit map command" do
      subject { driver(:map_command => 'cut -f 1') }
      its(:reduce?)            { should be_false }
      its(:mapper_commandline) { should match(/^cut -f 1$/) }
    end
    context "with a single widget" do
      subject { driver('regexp') }
      its(:reduce?)            { should be_false }
      its(:mapper_commandline) { should match(/^wu-local regexp$/) }
    end
    context "with a single file" do
      context "defining a processor named 'mapper'" do
        subject { driver(example_script('map_only.rb')) }
        its(:reduce?)            { should be_false }
        its(:mapper_commandline) { should match(/^wu-local .*map_only\.rb --run=mapper$/) }
      end
      context "defining a processor named after the file" do
        subject { driver(example_script('tokenizer.rb')) }
        its(:reduce?)            { should be_false }
        its(:mapper_commandline) { should match(/^wu-local .*tokenizer\.rb$/) }
      end
      context "using the given --mapper option " do
        subject { driver(example_script('processors.rb'), :mapper => 'tokenizer') }
        its(:reduce?)            { should be_false }
        its(:mapper_commandline) { should match(/^wu-local .*processors\.rb --run=tokenizer$/) }
      end
      context "defining a processor named 'reducer' but with --reduce_tasks=0" do
        subject { driver(example_script('word_count.rb'), :reduce_tasks => 0) }
        its(:reduce?)            { should be_false }
        its(:mapper_commandline) { should match(/^wu-local .*word_count\.rb --run=mapper$/) }
      end
    end
    context "with two files but with --reduce_tasks=0" do
      subject { driver(example_script('tokenizer.rb'), example_script('counter.rb'), :reduce_tasks => 0) }
      its(:reduce?)            { should be_false }
      its(:mapper_commandline) { should match(/^wu-local .*tokenizer\.rb$/) }
    end
  end

  context "will execute a map-reduce job" do
    context "with explicit map and reduce commands" do
      subject { driver(:map_command => 'cut -f 1', :reduce_command => 'uniq -c') }
      its(:reduce?)             { should be_true }
      its(:mapper_commandline)  { should == 'cut -f 1' }
      its(:reducer_commandline) { should == 'uniq -c' }
    end
    context "with two widgets" do
      subject { driver('regexp', 'count') }
      its(:reduce?)             { should be_true }
      its(:mapper_commandline)  { should match(/^wu-local regexp$/) }
      its(:reducer_commandline) { should match(/^wu-local count$/) }
    end
    context "with a single file" do
      context "defining processors named 'mapper' and 'reducer'" do
        subject { driver(example_script('word_count.rb')) }
        its(:reduce?)             { should be_true }
        its(:mapper_commandline)  { should match(/^wu-local .*word_count\.rb --run=mapper$/) }
        its(:reducer_commandline) { should match(/^wu-local .*word_count\.rb --run=reducer$/) }
      end
    end
    context "with two files" do
      subject { driver(example_script('tokenizer.rb'), example_script('counter.rb')) }
      its(:reduce?)             { should be_true }
      its(:mapper_commandline)  { should match(/^wu-local .*tokenizer\.rb$/) }
      its(:reducer_commandline) { should match(/^wu-local .*counter\.rb$/) }
    end
    context "with a widget and a file" do
      subject { driver('regexp', example_script('counter.rb')) }
      its(:reduce?)             { should be_true }
      its(:mapper_commandline)  { should match(/^wu-local regexp$/) }
      its(:reducer_commandline) { should match(/^wu-local .*counter\.rb$/) }
    end
    context "with a file and a widget" do
      subject { driver(example_script('tokenizer.rb'), 'count') }
      its(:reduce?)             { should be_true }
      its(:mapper_commandline)  { should match(/^wu-local .*tokenizer\.rb$/) }
      its(:reducer_commandline) { should match(/^wu-local count$/) }
    end
  end

  context "handling arguments" do
    subject { driver('regexp', :clean => 'hi', :messy => 'hi "there"', :reduce_tasks => 0, :dry_run => true, :rm => true) }
    it "passes arguments it doesn't know about to wu-local" do
      subject.mapper_commandline.should include('--clean=hi')
    end
    it "correctly passes messy arguments" do
      subject.mapper_commandline.should include('--messy=hi\\ \\"there\\"')
    end
    it "does not pass arguments that are internal to wukong-hadoop" do
      subject.mapper_commandline.should_not include('--reduce_tasks', '--dry_run', '--rm')
    end
  end

  context "given the --command_prefix option" do
    subject { driver('regexp', 'count', :command_prefix => 'bundle exec') }
    its(:mapper_commandline)  { should match(/^bundle exec wu-local/) }
    its(:reducer_commandline) { should match(/^bundle exec wu-local/) }
  end

end