wukong-hadoop 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. data/.gitignore +59 -0
  2. data/.rspec +2 -0
  3. data/Gemfile +3 -0
  4. data/README.md +339 -0
  5. data/Rakefile +13 -0
  6. data/bin/hdp-bin +44 -0
  7. data/bin/hdp-bzip +23 -0
  8. data/bin/hdp-cat +3 -0
  9. data/bin/hdp-catd +3 -0
  10. data/bin/hdp-cp +3 -0
  11. data/bin/hdp-du +86 -0
  12. data/bin/hdp-get +3 -0
  13. data/bin/hdp-kill +3 -0
  14. data/bin/hdp-kill-task +3 -0
  15. data/bin/hdp-ls +11 -0
  16. data/bin/hdp-mkdir +2 -0
  17. data/bin/hdp-mkdirp +12 -0
  18. data/bin/hdp-mv +3 -0
  19. data/bin/hdp-parts_to_keys.rb +77 -0
  20. data/bin/hdp-ps +3 -0
  21. data/bin/hdp-put +3 -0
  22. data/bin/hdp-rm +32 -0
  23. data/bin/hdp-sort +40 -0
  24. data/bin/hdp-stream +40 -0
  25. data/bin/hdp-stream-flat +22 -0
  26. data/bin/hdp-stream2 +39 -0
  27. data/bin/hdp-sync +17 -0
  28. data/bin/hdp-wc +67 -0
  29. data/bin/wu-hadoop +14 -0
  30. data/examples/counter.rb +17 -0
  31. data/examples/map_only.rb +28 -0
  32. data/examples/processors.rb +4 -0
  33. data/examples/sonnet_18.txt +14 -0
  34. data/examples/tokenizer.rb +28 -0
  35. data/examples/word_count.rb +44 -0
  36. data/features/step_definitions/wu_hadoop_steps.rb +4 -0
  37. data/features/support/env.rb +1 -0
  38. data/features/wu_hadoop.feature +113 -0
  39. data/lib/wukong-hadoop.rb +21 -0
  40. data/lib/wukong-hadoop/configuration.rb +133 -0
  41. data/lib/wukong-hadoop/driver.rb +190 -0
  42. data/lib/wukong-hadoop/driver/hadoop_invocation.rb +184 -0
  43. data/lib/wukong-hadoop/driver/inputs_and_outputs.rb +27 -0
  44. data/lib/wukong-hadoop/driver/local_invocation.rb +48 -0
  45. data/lib/wukong-hadoop/driver/map_logic.rb +104 -0
  46. data/lib/wukong-hadoop/driver/reduce_logic.rb +129 -0
  47. data/lib/wukong-hadoop/extensions.rb +2 -0
  48. data/lib/wukong-hadoop/hadoop_env_methods.rb +80 -0
  49. data/lib/wukong-hadoop/version.rb +6 -0
  50. data/spec/spec_helper.rb +21 -0
  51. data/spec/support/driver_helper.rb +15 -0
  52. data/spec/support/integration_helper.rb +39 -0
  53. data/spec/wukong-hadoop/driver_spec.rb +117 -0
  54. data/spec/wukong-hadoop/hadoop_env_methods_spec.rb +14 -0
  55. data/spec/wukong-hadoop/hadoop_mode_spec.rb +78 -0
  56. data/spec/wukong-hadoop/local_mode_spec.rb +22 -0
  57. data/spec/wukong-hadoop/wu_hadoop_spec.rb +34 -0
  58. data/wukong-hadoop.gemspec +33 -0
  59. metadata +168 -0
@@ -0,0 +1,14 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wukong::Hadoop::EnvMethods do
4
+
5
+ subject{ Wukong::Processor.new }
6
+
7
+ it{ should respond_to(:input_file) }
8
+ it{ should respond_to(:input_dir) }
9
+ it{ should respond_to(:map_input_start_offset) }
10
+ it{ should respond_to(:map_input_length) }
11
+ it{ should respond_to(:attempt_id) }
12
+ it{ should respond_to(:curr_task_id) }
13
+
14
+ end
@@ -0,0 +1,78 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wukong::Hadoop::HadoopInvocation do
4
+
5
+ let(:map_only) { driver('regexp', input: '/tmp/input1,/tmp/input2', output: '/tmp/output') }
6
+ let(:map_reduce) { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output') }
7
+ let(:complex) { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', map_tasks: '100', job_name: 'testy', java_opts: ['-D foo.bar=3 -D baz.booz=hello', '-D hi.there=bye']) }
8
+ let(:custum_io) { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', input_format: 'com.example.InputFormat', output_format: 'com.example.OutputFormat') }
9
+
10
+ context "defining input paths" do
11
+ it "raises an error unless given an --input option" do
12
+ lambda { driver('regexp', output: '/tmp/output').run! }.should raise_error(Wukong::Error, /--input.*required/)
13
+ end
14
+ it "sets its input paths correctly" do
15
+ map_reduce.hadoop_commandline.should match(%r{-input\s+'/tmp/input1,/tmp/input2'})
16
+ end
17
+ it "sets its input format given the --input_format option" do
18
+ custum_io.hadoop_commandline.should match(%r{-inputformat\s+'com.example.InputFormat'})
19
+ end
20
+ end
21
+
22
+ context "defining its output path" do
23
+ it "raises an error unless given an --output option" do
24
+ lambda { driver('regexp', input: '/tmp/output').run! }.should raise_error(Wukong::Error, /--output.*required/)
25
+ end
26
+ it "sets its output path correctly" do
27
+ map_reduce.hadoop_commandline.should match(%r{-output\s+'/tmp/output'})
28
+ end
29
+ it "sets its output format given the --output_format option" do
30
+ custum_io.hadoop_commandline.should match(%r{-outputformat\s+'com.example.OutputFormat'})
31
+ end
32
+ end
33
+
34
+ context "defining its mapper and reducer" do
35
+ it "sets its mapper correctly" do
36
+ map_reduce.hadoop_commandline.should match(%r{-mapper\s+'wu-local regexp'})
37
+ end
38
+ it "sets its reducer correctly" do
39
+ map_reduce.hadoop_commandline.should match(%r{-reducer\s+'wu-local count'})
40
+ end
41
+ it "uses a blank reducer for a map-only job" do
42
+ map_only.hadoop_commandline.should match(%r{-reducer\s+''})
43
+ end
44
+ end
45
+
46
+ context "defining Hadoop JobConf options" do
47
+ it "translates friendly names into native ones" do
48
+ complex.hadoop_commandline.should include("-D mapred.job.name='testy'")
49
+ complex.hadoop_commandline.should include("-D mapred.map.tasks=100")
50
+ end
51
+ it "passes options in the given --java_opts option" do
52
+ complex.hadoop_commandline.should include('-D foo.bar=3','-D baz.booz=hello','-D hi.there=bye')
53
+ end
54
+ end
55
+
56
+ context "removing existing output paths" do
57
+ before { Log.stub!(:info) }
58
+ it "will not remove the output path by default" do
59
+ map_reduce.should_not_receive(:remove_output_path!)
60
+ map_reduce.should_receive(:execute_command!)
61
+ map_reduce.run!
62
+ end
63
+ it "will remove the output path when given the --rm option" do
64
+ d = driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', rm: true)
65
+ d.should_receive(:remove_output_path!)
66
+ d.should_receive(:execute_command!)
67
+ d.run!
68
+ end
69
+ it "will not remove the output path when given the --rm option AND the --dry_run option" do
70
+ d = driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', rm: true, dry_run: true)
71
+ d.should_receive(:remove_output_path!)
72
+ d.should_receive(:execute_command!)
73
+ d.run!
74
+ end
75
+
76
+ end
77
+
78
+ end
@@ -0,0 +1,22 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wukong::Hadoop::LocalInvocation do
4
+ it "reads from STDIN and writes to STDOUT by default" do
5
+ driver('regexp').local_commandline.should == 'wu-local regexp'
6
+ end
7
+ it "reads from multiple input paths given the --input option" do
8
+ driver('regexp', :input => '/some/file.tsv,something_else.dat').local_commandline.should == 'cat /some/file.tsv something_else.dat | wu-local regexp'
9
+ end
10
+ it "writes to a file given the --output option" do
11
+ driver('regexp', :output => '/tmp/output.json').local_commandline.should == 'wu-local regexp > /tmp/output.json'
12
+ end
13
+ it "will not perform a sort on a map-only job" do
14
+ driver('regexp').local_commandline.should_not include('sort')
15
+ end
16
+ it "will perform a sort on a map-reduce job" do
17
+ driver('regexp', 'count').local_commandline.should == 'wu-local regexp | sort | wu-local count'
18
+ end
19
+ it "will accept a custom sort command" do
20
+ driver('regexp', 'count', :sort_command => 'sort -n').local_commandline.should == 'wu-local regexp | sort -n | wu-local count'
21
+ end
22
+ end
@@ -0,0 +1,34 @@
1
+ require 'spec_helper'
2
+
3
+ describe 'wu-hadoop' do
4
+ context "without any arguments" do
5
+ let(:subject) { command('wu-hadoop') }
6
+ it {should exit_with(:non_zero) }
7
+ it "displays help on STDERR" do
8
+ should have_stderr("usage: wu-hadoop")
9
+ end
10
+ end
11
+
12
+ context "in local mode" do
13
+ context "on a map-only job" do
14
+ let(:subject) { command('wu-hadoop', example_script('tokenizer.rb'), "--mode=local", "--input=#{example_script('sonnet_18.txt')}") }
15
+ it { should exit_with(0) }
16
+ it { should have_stdout('Shall', 'I', 'compare', 'thee', 'to', 'a', "summer's", 'day') }
17
+ end
18
+
19
+ context "on a map-reduce job" do
20
+ let(:subject) { command('wu-hadoop', example_script('word_count.rb'), "--mode=local", "--input=#{example_script('sonnet_18.txt')}") }
21
+ it { should exit_with(0) }
22
+ it { should have_stdout(/complexion\s+1/, /Death\s+1/, /temperate\s+1/) }
23
+ end
24
+ end
25
+
26
+ context "in Hadoop mode" do
27
+ context "on a map-only job" do
28
+ let(:subject) { command('wu-hadoop', example_script('tokenizer.rb'), "--mode=hadoop", "--input=/data/in", "--output=/data/out", "--dry_run") }
29
+ it { should exit_with(0) }
30
+ it { should have_stdout(%r{jar.*hadoop.*streaming.*\.jar}, %r{-mapper.+tokenizer\.rb}, %r{-input.*/data/in}, %r{-output.*/data/out}) }
31
+ end
32
+ end
33
+
34
+ end
@@ -0,0 +1,33 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path('../lib/wukong-hadoop/version', __FILE__)
3
+
4
+ Gem::Specification.new do |gem|
5
+ gem.name = 'wukong-hadoop'
6
+ gem.homepage = 'https://github.com/infochimps-labs/wukong-hadoop'
7
+ gem.licenses = ["Apache 2.0"]
8
+ gem.email = 'coders@infochimps.org'
9
+ gem.authors = ['Infochimps', 'Philip (flip) Kromer', 'Travis Dempsey']
10
+ gem.version = Wukong::Hadoop::VERSION
11
+
12
+ gem.summary = 'Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use it, yet handles terabyte-scale computation with ease.'
13
+ gem.description = <<-EOF
14
+ Treat your dataset like a:
15
+
16
+ * stream of lines when it's efficient to process by lines
17
+ * stream of field arrays when it's efficient to deal directly with fields
18
+ * stream of lightweight objects when it's efficient to deal with objects
19
+
20
+ Wukong is friends with Hadoop the elephant, Pig the query language, and the cat on your command line.
21
+ EOF
22
+
23
+ gem.files = `git ls-files`.split("\n")
24
+ gem.executables = ['wu-hadoop']
25
+ gem.test_files = gem.files.grep(/^spec/)
26
+ gem.require_paths = ['lib']
27
+
28
+ gem.add_dependency('wukong', '3.0.0.pre2')
29
+
30
+ gem.add_development_dependency 'rake', '~> 0.9'
31
+ gem.add_development_dependency 'rspec', '~> 2'
32
+
33
+ end
metadata ADDED
@@ -0,0 +1,168 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: wukong-hadoop
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Infochimps
9
+ - Philip (flip) Kromer
10
+ - Travis Dempsey
11
+ autorequire:
12
+ bindir: bin
13
+ cert_chain: []
14
+ date: 2012-12-01 00:00:00.000000000 Z
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
17
+ name: wukong
18
+ requirement: !ruby/object:Gem::Requirement
19
+ none: false
20
+ requirements:
21
+ - - '='
22
+ - !ruby/object:Gem::Version
23
+ version: 3.0.0.pre2
24
+ type: :runtime
25
+ prerelease: false
26
+ version_requirements: !ruby/object:Gem::Requirement
27
+ none: false
28
+ requirements:
29
+ - - '='
30
+ - !ruby/object:Gem::Version
31
+ version: 3.0.0.pre2
32
+ - !ruby/object:Gem::Dependency
33
+ name: rake
34
+ requirement: !ruby/object:Gem::Requirement
35
+ none: false
36
+ requirements:
37
+ - - ~>
38
+ - !ruby/object:Gem::Version
39
+ version: '0.9'
40
+ type: :development
41
+ prerelease: false
42
+ version_requirements: !ruby/object:Gem::Requirement
43
+ none: false
44
+ requirements:
45
+ - - ~>
46
+ - !ruby/object:Gem::Version
47
+ version: '0.9'
48
+ - !ruby/object:Gem::Dependency
49
+ name: rspec
50
+ requirement: !ruby/object:Gem::Requirement
51
+ none: false
52
+ requirements:
53
+ - - ~>
54
+ - !ruby/object:Gem::Version
55
+ version: '2'
56
+ type: :development
57
+ prerelease: false
58
+ version_requirements: !ruby/object:Gem::Requirement
59
+ none: false
60
+ requirements:
61
+ - - ~>
62
+ - !ruby/object:Gem::Version
63
+ version: '2'
64
+ description: ! " Treat your dataset like a:\n\n * stream of lines when it's
65
+ efficient to process by lines\n * stream of field arrays when it's efficient
66
+ to deal directly with fields\n * stream of lightweight objects when it's efficient
67
+ to deal with objects\n\n Wukong is friends with Hadoop the elephant, Pig the query
68
+ language, and the cat on your command line.\n"
69
+ email: coders@infochimps.org
70
+ executables:
71
+ - wu-hadoop
72
+ extensions: []
73
+ extra_rdoc_files: []
74
+ files:
75
+ - .gitignore
76
+ - .rspec
77
+ - Gemfile
78
+ - README.md
79
+ - Rakefile
80
+ - bin/hdp-bin
81
+ - bin/hdp-bzip
82
+ - bin/hdp-cat
83
+ - bin/hdp-catd
84
+ - bin/hdp-cp
85
+ - bin/hdp-du
86
+ - bin/hdp-get
87
+ - bin/hdp-kill
88
+ - bin/hdp-kill-task
89
+ - bin/hdp-ls
90
+ - bin/hdp-mkdir
91
+ - bin/hdp-mkdirp
92
+ - bin/hdp-mv
93
+ - bin/hdp-parts_to_keys.rb
94
+ - bin/hdp-ps
95
+ - bin/hdp-put
96
+ - bin/hdp-rm
97
+ - bin/hdp-sort
98
+ - bin/hdp-stream
99
+ - bin/hdp-stream-flat
100
+ - bin/hdp-stream2
101
+ - bin/hdp-sync
102
+ - bin/hdp-wc
103
+ - bin/wu-hadoop
104
+ - examples/counter.rb
105
+ - examples/map_only.rb
106
+ - examples/processors.rb
107
+ - examples/sonnet_18.txt
108
+ - examples/tokenizer.rb
109
+ - examples/word_count.rb
110
+ - features/step_definitions/wu_hadoop_steps.rb
111
+ - features/support/env.rb
112
+ - features/wu_hadoop.feature
113
+ - lib/wukong-hadoop.rb
114
+ - lib/wukong-hadoop/configuration.rb
115
+ - lib/wukong-hadoop/driver.rb
116
+ - lib/wukong-hadoop/driver/hadoop_invocation.rb
117
+ - lib/wukong-hadoop/driver/inputs_and_outputs.rb
118
+ - lib/wukong-hadoop/driver/local_invocation.rb
119
+ - lib/wukong-hadoop/driver/map_logic.rb
120
+ - lib/wukong-hadoop/driver/reduce_logic.rb
121
+ - lib/wukong-hadoop/extensions.rb
122
+ - lib/wukong-hadoop/hadoop_env_methods.rb
123
+ - lib/wukong-hadoop/version.rb
124
+ - spec/spec_helper.rb
125
+ - spec/support/driver_helper.rb
126
+ - spec/support/integration_helper.rb
127
+ - spec/wukong-hadoop/driver_spec.rb
128
+ - spec/wukong-hadoop/hadoop_env_methods_spec.rb
129
+ - spec/wukong-hadoop/hadoop_mode_spec.rb
130
+ - spec/wukong-hadoop/local_mode_spec.rb
131
+ - spec/wukong-hadoop/wu_hadoop_spec.rb
132
+ - wukong-hadoop.gemspec
133
+ homepage: https://github.com/infochimps-labs/wukong-hadoop
134
+ licenses:
135
+ - Apache 2.0
136
+ post_install_message:
137
+ rdoc_options: []
138
+ require_paths:
139
+ - lib
140
+ required_ruby_version: !ruby/object:Gem::Requirement
141
+ none: false
142
+ requirements:
143
+ - - ! '>='
144
+ - !ruby/object:Gem::Version
145
+ version: '0'
146
+ required_rubygems_version: !ruby/object:Gem::Requirement
147
+ none: false
148
+ requirements:
149
+ - - ! '>='
150
+ - !ruby/object:Gem::Version
151
+ version: '0'
152
+ requirements: []
153
+ rubyforge_project:
154
+ rubygems_version: 1.8.23
155
+ signing_key:
156
+ specification_version: 3
157
+ summary: Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use
158
+ it, yet handles terabyte-scale computation with ease.
159
+ test_files:
160
+ - spec/spec_helper.rb
161
+ - spec/support/driver_helper.rb
162
+ - spec/support/integration_helper.rb
163
+ - spec/wukong-hadoop/driver_spec.rb
164
+ - spec/wukong-hadoop/hadoop_env_methods_spec.rb
165
+ - spec/wukong-hadoop/hadoop_mode_spec.rb
166
+ - spec/wukong-hadoop/local_mode_spec.rb
167
+ - spec/wukong-hadoop/wu_hadoop_spec.rb
168
+ has_rdoc: