wukong-hadoop 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. data/.gitignore +59 -0
  2. data/.rspec +2 -0
  3. data/Gemfile +3 -0
  4. data/README.md +339 -0
  5. data/Rakefile +13 -0
  6. data/bin/hdp-bin +44 -0
  7. data/bin/hdp-bzip +23 -0
  8. data/bin/hdp-cat +3 -0
  9. data/bin/hdp-catd +3 -0
  10. data/bin/hdp-cp +3 -0
  11. data/bin/hdp-du +86 -0
  12. data/bin/hdp-get +3 -0
  13. data/bin/hdp-kill +3 -0
  14. data/bin/hdp-kill-task +3 -0
  15. data/bin/hdp-ls +11 -0
  16. data/bin/hdp-mkdir +2 -0
  17. data/bin/hdp-mkdirp +12 -0
  18. data/bin/hdp-mv +3 -0
  19. data/bin/hdp-parts_to_keys.rb +77 -0
  20. data/bin/hdp-ps +3 -0
  21. data/bin/hdp-put +3 -0
  22. data/bin/hdp-rm +32 -0
  23. data/bin/hdp-sort +40 -0
  24. data/bin/hdp-stream +40 -0
  25. data/bin/hdp-stream-flat +22 -0
  26. data/bin/hdp-stream2 +39 -0
  27. data/bin/hdp-sync +17 -0
  28. data/bin/hdp-wc +67 -0
  29. data/bin/wu-hadoop +14 -0
  30. data/examples/counter.rb +17 -0
  31. data/examples/map_only.rb +28 -0
  32. data/examples/processors.rb +4 -0
  33. data/examples/sonnet_18.txt +14 -0
  34. data/examples/tokenizer.rb +28 -0
  35. data/examples/word_count.rb +44 -0
  36. data/features/step_definitions/wu_hadoop_steps.rb +4 -0
  37. data/features/support/env.rb +1 -0
  38. data/features/wu_hadoop.feature +113 -0
  39. data/lib/wukong-hadoop.rb +21 -0
  40. data/lib/wukong-hadoop/configuration.rb +133 -0
  41. data/lib/wukong-hadoop/driver.rb +190 -0
  42. data/lib/wukong-hadoop/driver/hadoop_invocation.rb +184 -0
  43. data/lib/wukong-hadoop/driver/inputs_and_outputs.rb +27 -0
  44. data/lib/wukong-hadoop/driver/local_invocation.rb +48 -0
  45. data/lib/wukong-hadoop/driver/map_logic.rb +104 -0
  46. data/lib/wukong-hadoop/driver/reduce_logic.rb +129 -0
  47. data/lib/wukong-hadoop/extensions.rb +2 -0
  48. data/lib/wukong-hadoop/hadoop_env_methods.rb +80 -0
  49. data/lib/wukong-hadoop/version.rb +6 -0
  50. data/spec/spec_helper.rb +21 -0
  51. data/spec/support/driver_helper.rb +15 -0
  52. data/spec/support/integration_helper.rb +39 -0
  53. data/spec/wukong-hadoop/driver_spec.rb +117 -0
  54. data/spec/wukong-hadoop/hadoop_env_methods_spec.rb +14 -0
  55. data/spec/wukong-hadoop/hadoop_mode_spec.rb +78 -0
  56. data/spec/wukong-hadoop/local_mode_spec.rb +22 -0
  57. data/spec/wukong-hadoop/wu_hadoop_spec.rb +34 -0
  58. data/wukong-hadoop.gemspec +33 -0
  59. metadata +168 -0
@@ -0,0 +1,14 @@
require 'spec_helper'

# Checks that a freshly constructed Wukong::Processor answers to each of the
# Hadoop environment reader methods listed below.
# NOTE(review): presumably these methods reach Wukong::Processor because
# Wukong::Hadoop::EnvMethods is mixed into it -- confirm against the library.
describe Wukong::Hadoop::EnvMethods do

  subject { Wukong::Processor.new }

  # One implicit-subject example is generated per method name, in this order.
  [
    :input_file,
    :input_dir,
    :map_input_start_offset,
    :map_input_length,
    :attempt_id,
    :curr_task_id
  ].each do |env_method|
    it { should respond_to(env_method) }
  end

end
@@ -0,0 +1,78 @@
require 'spec_helper'

# Specs for the command line the driver assembles in Hadoop mode
# (+hadoop_commandline+) and for how +run!+ treats the output path.
# The +driver+ helper used throughout is defined outside this file.
describe Wukong::Hadoop::HadoopInvocation do

  # Drivers shared by the examples below: a map-only job, a map-reduce job, a
  # job with extra JobConf settings, and a job with custom input/output formats.
  let(:map_only)   { driver('regexp', input: '/tmp/input1,/tmp/input2', output: '/tmp/output') }
  let(:map_reduce) { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output') }
  let(:complex)    { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', map_tasks: '100', job_name: 'testy', java_opts: ['-D foo.bar=3 -D baz.booz=hello', '-D hi.there=bye']) }
  let(:custom_io)  { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', input_format: 'com.example.InputFormat', output_format: 'com.example.OutputFormat') }

  context "defining input paths" do
    it "raises an error unless given an --input option" do
      lambda { driver('regexp', output: '/tmp/output').run! }.should raise_error(Wukong::Error, /--input.*required/)
    end
    it "sets its input paths correctly" do
      map_reduce.hadoop_commandline.should match(%r{-input\s+'/tmp/input1,/tmp/input2'})
    end
    it "sets its input format given the --input_format option" do
      # Dots are escaped so the pattern only matches the literal class name.
      custom_io.hadoop_commandline.should match(%r{-inputformat\s+'com\.example\.InputFormat'})
    end
  end

  context "defining its output path" do
    it "raises an error unless given an --output option" do
      lambda { driver('regexp', input: '/tmp/output').run! }.should raise_error(Wukong::Error, /--output.*required/)
    end
    it "sets its output path correctly" do
      map_reduce.hadoop_commandline.should match(%r{-output\s+'/tmp/output'})
    end
    it "sets its output format given the --output_format option" do
      # Dots are escaped so the pattern only matches the literal class name.
      custom_io.hadoop_commandline.should match(%r{-outputformat\s+'com\.example\.OutputFormat'})
    end
  end

  context "defining its mapper and reducer" do
    it "sets its mapper correctly" do
      map_reduce.hadoop_commandline.should match(%r{-mapper\s+'wu-local regexp'})
    end
    it "sets its reducer correctly" do
      map_reduce.hadoop_commandline.should match(%r{-reducer\s+'wu-local count'})
    end
    it "uses a blank reducer for a map-only job" do
      map_only.hadoop_commandline.should match(%r{-reducer\s+''})
    end
  end

  context "defining Hadoop JobConf options" do
    it "translates friendly names into native ones" do
      complex.hadoop_commandline.should include("-D mapred.job.name='testy'")
      complex.hadoop_commandline.should include("-D mapred.map.tasks=100")
    end
    it "passes options in the given --java_opts option" do
      complex.hadoop_commandline.should include('-D foo.bar=3','-D baz.booz=hello','-D hi.there=bye')
    end
  end

  context "removing existing output paths" do
    # Silence informational logging while run! is exercised.
    before { Log.stub!(:info) }

    it "will not remove the output path by default" do
      map_reduce.should_not_receive(:remove_output_path!)
      map_reduce.should_receive(:execute_command!)
      map_reduce.run!
    end
    it "will remove the output path when given the --rm option" do
      d = driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', rm: true)
      d.should_receive(:remove_output_path!)
      d.should_receive(:execute_command!)
      d.run!
    end
    it "will not remove the output path when given the --rm option AND the --dry_run option" do
      d = driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', rm: true, dry_run: true)
      # NOTE(review): the description says the path is NOT removed, yet this
      # expects remove_output_path! to be called. Presumably the dry-run guard
      # lives inside that method -- confirm against the driver, then either
      # reword the description or switch this to should_not_receive.
      d.should_receive(:remove_output_path!)
      d.should_receive(:execute_command!)
      d.run!
    end

  end

end
@@ -0,0 +1,22 @@
require 'spec_helper'

# Specs for the shell pipeline the driver builds in local mode
# (+local_commandline+). The +driver+ helper is defined outside this file.
describe Wukong::Hadoop::LocalInvocation do
  it "reads from STDIN and writes to STDOUT by default" do
    driver('regexp').local_commandline.should == 'wu-local regexp'
  end
  it "reads from multiple input paths given the --input option" do
    # The comma-separated --input value is expected to become a space-separated `cat`.
    driver('regexp', input: '/some/file.tsv,something_else.dat').local_commandline.should == 'cat /some/file.tsv something_else.dat | wu-local regexp'
  end
  it "writes to a file given the --output option" do
    driver('regexp', output: '/tmp/output.json').local_commandline.should == 'wu-local regexp > /tmp/output.json'
  end
  it "will not perform a sort on a map-only job" do
    driver('regexp').local_commandline.should_not include('sort')
  end
  it "will perform a sort on a map-reduce job" do
    driver('regexp', 'count').local_commandline.should == 'wu-local regexp | sort | wu-local count'
  end
  it "will accept a custom sort command" do
    driver('regexp', 'count', sort_command: 'sort -n').local_commandline.should == 'wu-local regexp | sort -n | wu-local count'
  end
end
@@ -0,0 +1,34 @@
require 'spec_helper'

# Integration specs for the `wu-hadoop` executable. The +command+ and
# +example_script+ helpers and the +exit_with+ / +have_stdout+ / +have_stderr+
# matchers are defined outside this file.
describe 'wu-hadoop' do
  context "without any arguments" do
    subject { command('wu-hadoop') }
    it { should exit_with(:non_zero) }
    it "displays help on STDERR" do
      should have_stderr("usage: wu-hadoop")
    end
  end

  context "in local mode" do
    context "on a map-only job" do
      subject { command('wu-hadoop', example_script('tokenizer.rb'), "--mode=local", "--input=#{example_script('sonnet_18.txt')}") }
      it { should exit_with(0) }
      it { should have_stdout('Shall', 'I', 'compare', 'thee', 'to', 'a', "summer's", 'day') }
    end

    context "on a map-reduce job" do
      subject { command('wu-hadoop', example_script('word_count.rb'), "--mode=local", "--input=#{example_script('sonnet_18.txt')}") }
      it { should exit_with(0) }
      it { should have_stdout(/complexion\s+1/, /Death\s+1/, /temperate\s+1/) }
    end
  end

  context "in Hadoop mode" do
    context "on a map-only job" do
      # --dry_run is passed; the expectations only inspect the printed command line.
      subject { command('wu-hadoop', example_script('tokenizer.rb'), "--mode=hadoop", "--input=/data/in", "--output=/data/out", "--dry_run") }
      it { should exit_with(0) }
      it { should have_stdout(%r{jar.*hadoop.*streaming.*\.jar}, %r{-mapper.+tokenizer\.rb}, %r{-input.*/data/in}, %r{-output.*/data/out}) }
    end
  end

end
@@ -0,0 +1,33 @@
# -*- encoding: utf-8 -*-
require File.expand_path('../lib/wukong-hadoop/version', __FILE__)

# Gem specification for wukong-hadoop. The version number is read from
# Wukong::Hadoop::VERSION, loaded by the require above.
Gem::Specification.new do |spec|
  # Identity and contact details
  spec.name     = 'wukong-hadoop'
  spec.homepage = 'https://github.com/infochimps-labs/wukong-hadoop'
  spec.licenses = ['Apache 2.0']
  spec.email    = 'coders@infochimps.org'
  spec.authors  = ['Infochimps', 'Philip (flip) Kromer', 'Travis Dempsey']
  spec.version  = Wukong::Hadoop::VERSION

  spec.summary     = 'Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use it, yet handles terabyte-scale computation with ease.'
  spec.description = <<-EOF
  Treat your dataset like a:

  * stream of lines when it's efficient to process by lines
  * stream of field arrays when it's efficient to deal directly with fields
  * stream of lightweight objects when it's efficient to deal with objects

  Wukong is friends with Hadoop the elephant, Pig the query language, and the cat on your command line.
  EOF

  # Package every file tracked by git; only `wu-hadoop` is installed as an
  # executable, and files under spec/ are registered as test files.
  spec.files         = %x(git ls-files).split("\n")
  spec.executables   = ['wu-hadoop']
  spec.test_files    = spec.files.select { |path| path =~ /^spec/ }
  spec.require_paths = ['lib']

  # Runtime dependency, pinned to an exact pre-release version.
  spec.add_dependency 'wukong', '3.0.0.pre2'

  # Development-only dependencies
  spec.add_development_dependency 'rake',  '~> 0.9'
  spec.add_development_dependency 'rspec', '~> 2'

end
metadata ADDED
@@ -0,0 +1,168 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: wukong-hadoop
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Infochimps
9
+ - Philip (flip) Kromer
10
+ - Travis Dempsey
11
+ autorequire:
12
+ bindir: bin
13
+ cert_chain: []
14
+ date: 2012-12-01 00:00:00.000000000 Z
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
17
+ name: wukong
18
+ requirement: !ruby/object:Gem::Requirement
19
+ none: false
20
+ requirements:
21
+ - - '='
22
+ - !ruby/object:Gem::Version
23
+ version: 3.0.0.pre2
24
+ type: :runtime
25
+ prerelease: false
26
+ version_requirements: !ruby/object:Gem::Requirement
27
+ none: false
28
+ requirements:
29
+ - - '='
30
+ - !ruby/object:Gem::Version
31
+ version: 3.0.0.pre2
32
+ - !ruby/object:Gem::Dependency
33
+ name: rake
34
+ requirement: !ruby/object:Gem::Requirement
35
+ none: false
36
+ requirements:
37
+ - - ~>
38
+ - !ruby/object:Gem::Version
39
+ version: '0.9'
40
+ type: :development
41
+ prerelease: false
42
+ version_requirements: !ruby/object:Gem::Requirement
43
+ none: false
44
+ requirements:
45
+ - - ~>
46
+ - !ruby/object:Gem::Version
47
+ version: '0.9'
48
+ - !ruby/object:Gem::Dependency
49
+ name: rspec
50
+ requirement: !ruby/object:Gem::Requirement
51
+ none: false
52
+ requirements:
53
+ - - ~>
54
+ - !ruby/object:Gem::Version
55
+ version: '2'
56
+ type: :development
57
+ prerelease: false
58
+ version_requirements: !ruby/object:Gem::Requirement
59
+ none: false
60
+ requirements:
61
+ - - ~>
62
+ - !ruby/object:Gem::Version
63
+ version: '2'
64
+ description: ! " Treat your dataset like a:\n\n * stream of lines when it's
65
+ efficient to process by lines\n * stream of field arrays when it's efficient
66
+ to deal directly with fields\n * stream of lightweight objects when it's efficient
67
+ to deal with objects\n\n Wukong is friends with Hadoop the elephant, Pig the query
68
+ language, and the cat on your command line.\n"
69
+ email: coders@infochimps.org
70
+ executables:
71
+ - wu-hadoop
72
+ extensions: []
73
+ extra_rdoc_files: []
74
+ files:
75
+ - .gitignore
76
+ - .rspec
77
+ - Gemfile
78
+ - README.md
79
+ - Rakefile
80
+ - bin/hdp-bin
81
+ - bin/hdp-bzip
82
+ - bin/hdp-cat
83
+ - bin/hdp-catd
84
+ - bin/hdp-cp
85
+ - bin/hdp-du
86
+ - bin/hdp-get
87
+ - bin/hdp-kill
88
+ - bin/hdp-kill-task
89
+ - bin/hdp-ls
90
+ - bin/hdp-mkdir
91
+ - bin/hdp-mkdirp
92
+ - bin/hdp-mv
93
+ - bin/hdp-parts_to_keys.rb
94
+ - bin/hdp-ps
95
+ - bin/hdp-put
96
+ - bin/hdp-rm
97
+ - bin/hdp-sort
98
+ - bin/hdp-stream
99
+ - bin/hdp-stream-flat
100
+ - bin/hdp-stream2
101
+ - bin/hdp-sync
102
+ - bin/hdp-wc
103
+ - bin/wu-hadoop
104
+ - examples/counter.rb
105
+ - examples/map_only.rb
106
+ - examples/processors.rb
107
+ - examples/sonnet_18.txt
108
+ - examples/tokenizer.rb
109
+ - examples/word_count.rb
110
+ - features/step_definitions/wu_hadoop_steps.rb
111
+ - features/support/env.rb
112
+ - features/wu_hadoop.feature
113
+ - lib/wukong-hadoop.rb
114
+ - lib/wukong-hadoop/configuration.rb
115
+ - lib/wukong-hadoop/driver.rb
116
+ - lib/wukong-hadoop/driver/hadoop_invocation.rb
117
+ - lib/wukong-hadoop/driver/inputs_and_outputs.rb
118
+ - lib/wukong-hadoop/driver/local_invocation.rb
119
+ - lib/wukong-hadoop/driver/map_logic.rb
120
+ - lib/wukong-hadoop/driver/reduce_logic.rb
121
+ - lib/wukong-hadoop/extensions.rb
122
+ - lib/wukong-hadoop/hadoop_env_methods.rb
123
+ - lib/wukong-hadoop/version.rb
124
+ - spec/spec_helper.rb
125
+ - spec/support/driver_helper.rb
126
+ - spec/support/integration_helper.rb
127
+ - spec/wukong-hadoop/driver_spec.rb
128
+ - spec/wukong-hadoop/hadoop_env_methods_spec.rb
129
+ - spec/wukong-hadoop/hadoop_mode_spec.rb
130
+ - spec/wukong-hadoop/local_mode_spec.rb
131
+ - spec/wukong-hadoop/wu_hadoop_spec.rb
132
+ - wukong-hadoop.gemspec
133
+ homepage: https://github.com/infochimps-labs/wukong-hadoop
134
+ licenses:
135
+ - Apache 2.0
136
+ post_install_message:
137
+ rdoc_options: []
138
+ require_paths:
139
+ - lib
140
+ required_ruby_version: !ruby/object:Gem::Requirement
141
+ none: false
142
+ requirements:
143
+ - - ! '>='
144
+ - !ruby/object:Gem::Version
145
+ version: '0'
146
+ required_rubygems_version: !ruby/object:Gem::Requirement
147
+ none: false
148
+ requirements:
149
+ - - ! '>='
150
+ - !ruby/object:Gem::Version
151
+ version: '0'
152
+ requirements: []
153
+ rubyforge_project:
154
+ rubygems_version: 1.8.23
155
+ signing_key:
156
+ specification_version: 3
157
+ summary: Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use
158
+ it, yet handles terabyte-scale computation with ease.
159
+ test_files:
160
+ - spec/spec_helper.rb
161
+ - spec/support/driver_helper.rb
162
+ - spec/support/integration_helper.rb
163
+ - spec/wukong-hadoop/driver_spec.rb
164
+ - spec/wukong-hadoop/hadoop_env_methods_spec.rb
165
+ - spec/wukong-hadoop/hadoop_mode_spec.rb
166
+ - spec/wukong-hadoop/local_mode_spec.rb
167
+ - spec/wukong-hadoop/wu_hadoop_spec.rb
168
+ has_rdoc: