wukong-hadoop 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +59 -0
- data/.rspec +2 -0
- data/Gemfile +3 -0
- data/README.md +339 -0
- data/Rakefile +13 -0
- data/bin/hdp-bin +44 -0
- data/bin/hdp-bzip +23 -0
- data/bin/hdp-cat +3 -0
- data/bin/hdp-catd +3 -0
- data/bin/hdp-cp +3 -0
- data/bin/hdp-du +86 -0
- data/bin/hdp-get +3 -0
- data/bin/hdp-kill +3 -0
- data/bin/hdp-kill-task +3 -0
- data/bin/hdp-ls +11 -0
- data/bin/hdp-mkdir +2 -0
- data/bin/hdp-mkdirp +12 -0
- data/bin/hdp-mv +3 -0
- data/bin/hdp-parts_to_keys.rb +77 -0
- data/bin/hdp-ps +3 -0
- data/bin/hdp-put +3 -0
- data/bin/hdp-rm +32 -0
- data/bin/hdp-sort +40 -0
- data/bin/hdp-stream +40 -0
- data/bin/hdp-stream-flat +22 -0
- data/bin/hdp-stream2 +39 -0
- data/bin/hdp-sync +17 -0
- data/bin/hdp-wc +67 -0
- data/bin/wu-hadoop +14 -0
- data/examples/counter.rb +17 -0
- data/examples/map_only.rb +28 -0
- data/examples/processors.rb +4 -0
- data/examples/sonnet_18.txt +14 -0
- data/examples/tokenizer.rb +28 -0
- data/examples/word_count.rb +44 -0
- data/features/step_definitions/wu_hadoop_steps.rb +4 -0
- data/features/support/env.rb +1 -0
- data/features/wu_hadoop.feature +113 -0
- data/lib/wukong-hadoop.rb +21 -0
- data/lib/wukong-hadoop/configuration.rb +133 -0
- data/lib/wukong-hadoop/driver.rb +190 -0
- data/lib/wukong-hadoop/driver/hadoop_invocation.rb +184 -0
- data/lib/wukong-hadoop/driver/inputs_and_outputs.rb +27 -0
- data/lib/wukong-hadoop/driver/local_invocation.rb +48 -0
- data/lib/wukong-hadoop/driver/map_logic.rb +104 -0
- data/lib/wukong-hadoop/driver/reduce_logic.rb +129 -0
- data/lib/wukong-hadoop/extensions.rb +2 -0
- data/lib/wukong-hadoop/hadoop_env_methods.rb +80 -0
- data/lib/wukong-hadoop/version.rb +6 -0
- data/spec/spec_helper.rb +21 -0
- data/spec/support/driver_helper.rb +15 -0
- data/spec/support/integration_helper.rb +39 -0
- data/spec/wukong-hadoop/driver_spec.rb +117 -0
- data/spec/wukong-hadoop/hadoop_env_methods_spec.rb +14 -0
- data/spec/wukong-hadoop/hadoop_mode_spec.rb +78 -0
- data/spec/wukong-hadoop/local_mode_spec.rb +22 -0
- data/spec/wukong-hadoop/wu_hadoop_spec.rb +34 -0
- data/wukong-hadoop.gemspec +33 -0
- metadata +168 -0
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Wukong::Hadoop::EnvMethods do
|
4
|
+
|
5
|
+
subject{ Wukong::Processor.new }
|
6
|
+
|
7
|
+
it{ should respond_to(:input_file) }
|
8
|
+
it{ should respond_to(:input_dir) }
|
9
|
+
it{ should respond_to(:map_input_start_offset) }
|
10
|
+
it{ should respond_to(:map_input_length) }
|
11
|
+
it{ should respond_to(:attempt_id) }
|
12
|
+
it{ should respond_to(:curr_task_id) }
|
13
|
+
|
14
|
+
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Wukong::Hadoop::HadoopInvocation do
|
4
|
+
|
5
|
+
let(:map_only) { driver('regexp', input: '/tmp/input1,/tmp/input2', output: '/tmp/output') }
|
6
|
+
let(:map_reduce) { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output') }
|
7
|
+
let(:complex) { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', map_tasks: '100', job_name: 'testy', java_opts: ['-D foo.bar=3 -D baz.booz=hello', '-D hi.there=bye']) }
|
8
|
+
let(:custum_io) { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', input_format: 'com.example.InputFormat', output_format: 'com.example.OutputFormat') }
|
9
|
+
|
10
|
+
context "defining input paths" do
|
11
|
+
it "raises an error unless given an --input option" do
|
12
|
+
lambda { driver('regexp', output: '/tmp/output').run! }.should raise_error(Wukong::Error, /--input.*required/)
|
13
|
+
end
|
14
|
+
it "sets its input paths correctly" do
|
15
|
+
map_reduce.hadoop_commandline.should match(%r{-input\s+'/tmp/input1,/tmp/input2'})
|
16
|
+
end
|
17
|
+
it "sets its input format given the --input_format option" do
|
18
|
+
custum_io.hadoop_commandline.should match(%r{-inputformat\s+'com.example.InputFormat'})
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
context "defining its output path" do
|
23
|
+
it "raises an error unless given an --output option" do
|
24
|
+
lambda { driver('regexp', input: '/tmp/output').run! }.should raise_error(Wukong::Error, /--output.*required/)
|
25
|
+
end
|
26
|
+
it "sets its output path correctly" do
|
27
|
+
map_reduce.hadoop_commandline.should match(%r{-output\s+'/tmp/output'})
|
28
|
+
end
|
29
|
+
it "sets its output format given the --output_format option" do
|
30
|
+
custum_io.hadoop_commandline.should match(%r{-outputformat\s+'com.example.OutputFormat'})
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
context "defining its mapper and reducer" do
|
35
|
+
it "sets its mapper correctly" do
|
36
|
+
map_reduce.hadoop_commandline.should match(%r{-mapper\s+'wu-local regexp'})
|
37
|
+
end
|
38
|
+
it "sets its reducer correctly" do
|
39
|
+
map_reduce.hadoop_commandline.should match(%r{-reducer\s+'wu-local count'})
|
40
|
+
end
|
41
|
+
it "uses a blank reducer for a map-only job" do
|
42
|
+
map_only.hadoop_commandline.should match(%r{-reducer\s+''})
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
context "defining Hadoop JobConf options" do
|
47
|
+
it "translates friendly names into native ones" do
|
48
|
+
complex.hadoop_commandline.should include("-D mapred.job.name='testy'")
|
49
|
+
complex.hadoop_commandline.should include("-D mapred.map.tasks=100")
|
50
|
+
end
|
51
|
+
it "passes options in the given --java_opts option" do
|
52
|
+
complex.hadoop_commandline.should include('-D foo.bar=3','-D baz.booz=hello','-D hi.there=bye')
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
56
|
+
context "removing existing output paths" do
|
57
|
+
before { Log.stub!(:info) }
|
58
|
+
it "will not remove the output path by default" do
|
59
|
+
map_reduce.should_not_receive(:remove_output_path!)
|
60
|
+
map_reduce.should_receive(:execute_command!)
|
61
|
+
map_reduce.run!
|
62
|
+
end
|
63
|
+
it "will remove the output path when given the --rm option" do
|
64
|
+
d = driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', rm: true)
|
65
|
+
d.should_receive(:remove_output_path!)
|
66
|
+
d.should_receive(:execute_command!)
|
67
|
+
d.run!
|
68
|
+
end
|
69
|
+
it "will not remove the output path when given the --rm option AND the --dry_run option" do
|
70
|
+
d = driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', rm: true, dry_run: true)
|
71
|
+
d.should_receive(:remove_output_path!)
|
72
|
+
d.should_receive(:execute_command!)
|
73
|
+
d.run!
|
74
|
+
end
|
75
|
+
|
76
|
+
end
|
77
|
+
|
78
|
+
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Wukong::Hadoop::LocalInvocation do
|
4
|
+
it "reads from STDIN and writes to STDOUT by default" do
|
5
|
+
driver('regexp').local_commandline.should == 'wu-local regexp'
|
6
|
+
end
|
7
|
+
it "reads from multiple input paths given the --input option" do
|
8
|
+
driver('regexp', :input => '/some/file.tsv,something_else.dat').local_commandline.should == 'cat /some/file.tsv something_else.dat | wu-local regexp'
|
9
|
+
end
|
10
|
+
it "writes to a file given the --output option" do
|
11
|
+
driver('regexp', :output => '/tmp/output.json').local_commandline.should == 'wu-local regexp > /tmp/output.json'
|
12
|
+
end
|
13
|
+
it "will not perform a sort on a map-only job" do
|
14
|
+
driver('regexp').local_commandline.should_not include('sort')
|
15
|
+
end
|
16
|
+
it "will perform a sort on a map-reduce job" do
|
17
|
+
driver('regexp', 'count').local_commandline.should == 'wu-local regexp | sort | wu-local count'
|
18
|
+
end
|
19
|
+
it "will accept a custom sort command" do
|
20
|
+
driver('regexp', 'count', :sort_command => 'sort -n').local_commandline.should == 'wu-local regexp | sort -n | wu-local count'
|
21
|
+
end
|
22
|
+
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe 'wu-hadoop' do
|
4
|
+
context "without any arguments" do
|
5
|
+
let(:subject) { command('wu-hadoop') }
|
6
|
+
it {should exit_with(:non_zero) }
|
7
|
+
it "displays help on STDERR" do
|
8
|
+
should have_stderr("usage: wu-hadoop")
|
9
|
+
end
|
10
|
+
end
|
11
|
+
|
12
|
+
context "in local mode" do
|
13
|
+
context "on a map-only job" do
|
14
|
+
let(:subject) { command('wu-hadoop', example_script('tokenizer.rb'), "--mode=local", "--input=#{example_script('sonnet_18.txt')}") }
|
15
|
+
it { should exit_with(0) }
|
16
|
+
it { should have_stdout('Shall', 'I', 'compare', 'thee', 'to', 'a', "summer's", 'day') }
|
17
|
+
end
|
18
|
+
|
19
|
+
context "on a map-reduce job" do
|
20
|
+
let(:subject) { command('wu-hadoop', example_script('word_count.rb'), "--mode=local", "--input=#{example_script('sonnet_18.txt')}") }
|
21
|
+
it { should exit_with(0) }
|
22
|
+
it { should have_stdout(/complexion\s+1/, /Death\s+1/, /temperate\s+1/) }
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
context "in Hadoop mode" do
|
27
|
+
context "on a map-only job" do
|
28
|
+
let(:subject) { command('wu-hadoop', example_script('tokenizer.rb'), "--mode=hadoop", "--input=/data/in", "--output=/data/out", "--dry_run") }
|
29
|
+
it { should exit_with(0) }
|
30
|
+
it { should have_stdout(%r{jar.*hadoop.*streaming.*\.jar}, %r{-mapper.+tokenizer\.rb}, %r{-input.*/data/in}, %r{-output.*/data/out}) }
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require File.expand_path('../lib/wukong-hadoop/version', __FILE__)
|
3
|
+
|
4
|
+
Gem::Specification.new do |gem|
|
5
|
+
gem.name = 'wukong-hadoop'
|
6
|
+
gem.homepage = 'https://github.com/infochimps-labs/wukong-hadoop'
|
7
|
+
gem.licenses = ["Apache 2.0"]
|
8
|
+
gem.email = 'coders@infochimps.org'
|
9
|
+
gem.authors = ['Infochimps', 'Philip (flip) Kromer', 'Travis Dempsey']
|
10
|
+
gem.version = Wukong::Hadoop::VERSION
|
11
|
+
|
12
|
+
gem.summary = 'Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use it, yet handles terabyte-scale computation with ease.'
|
13
|
+
gem.description = <<-EOF
|
14
|
+
Treat your dataset like a:
|
15
|
+
|
16
|
+
* stream of lines when it's efficient to process by lines
|
17
|
+
* stream of field arrays when it's efficient to deal directly with fields
|
18
|
+
* stream of lightweight objects when it's efficient to deal with objects
|
19
|
+
|
20
|
+
Wukong is friends with Hadoop the elephant, Pig the query language, and the cat on your command line.
|
21
|
+
EOF
|
22
|
+
|
23
|
+
gem.files = `git ls-files`.split("\n")
|
24
|
+
gem.executables = ['wu-hadoop']
|
25
|
+
gem.test_files = gem.files.grep(/^spec/)
|
26
|
+
gem.require_paths = ['lib']
|
27
|
+
|
28
|
+
gem.add_dependency('wukong', '3.0.0.pre2')
|
29
|
+
|
30
|
+
gem.add_development_dependency 'rake', '~> 0.9'
|
31
|
+
gem.add_development_dependency 'rspec', '~> 2'
|
32
|
+
|
33
|
+
end
|
metadata
ADDED
@@ -0,0 +1,168 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: wukong-hadoop
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Infochimps
|
9
|
+
- Philip (flip) Kromer
|
10
|
+
- Travis Dempsey
|
11
|
+
autorequire:
|
12
|
+
bindir: bin
|
13
|
+
cert_chain: []
|
14
|
+
date: 2012-12-01 00:00:00.000000000 Z
|
15
|
+
dependencies:
|
16
|
+
- !ruby/object:Gem::Dependency
|
17
|
+
name: wukong
|
18
|
+
requirement: !ruby/object:Gem::Requirement
|
19
|
+
none: false
|
20
|
+
requirements:
|
21
|
+
- - '='
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 3.0.0.pre2
|
24
|
+
type: :runtime
|
25
|
+
prerelease: false
|
26
|
+
version_requirements: !ruby/object:Gem::Requirement
|
27
|
+
none: false
|
28
|
+
requirements:
|
29
|
+
- - '='
|
30
|
+
- !ruby/object:Gem::Version
|
31
|
+
version: 3.0.0.pre2
|
32
|
+
- !ruby/object:Gem::Dependency
|
33
|
+
name: rake
|
34
|
+
requirement: !ruby/object:Gem::Requirement
|
35
|
+
none: false
|
36
|
+
requirements:
|
37
|
+
- - ~>
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: '0.9'
|
40
|
+
type: :development
|
41
|
+
prerelease: false
|
42
|
+
version_requirements: !ruby/object:Gem::Requirement
|
43
|
+
none: false
|
44
|
+
requirements:
|
45
|
+
- - ~>
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0.9'
|
48
|
+
- !ruby/object:Gem::Dependency
|
49
|
+
name: rspec
|
50
|
+
requirement: !ruby/object:Gem::Requirement
|
51
|
+
none: false
|
52
|
+
requirements:
|
53
|
+
- - ~>
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: '2'
|
56
|
+
type: :development
|
57
|
+
prerelease: false
|
58
|
+
version_requirements: !ruby/object:Gem::Requirement
|
59
|
+
none: false
|
60
|
+
requirements:
|
61
|
+
- - ~>
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: '2'
|
64
|
+
description: ! " Treat your dataset like a:\n\n * stream of lines when it's
|
65
|
+
efficient to process by lines\n * stream of field arrays when it's efficient
|
66
|
+
to deal directly with fields\n * stream of lightweight objects when it's efficient
|
67
|
+
to deal with objects\n\n Wukong is friends with Hadoop the elephant, Pig the query
|
68
|
+
language, and the cat on your command line.\n"
|
69
|
+
email: coders@infochimps.org
|
70
|
+
executables:
|
71
|
+
- wu-hadoop
|
72
|
+
extensions: []
|
73
|
+
extra_rdoc_files: []
|
74
|
+
files:
|
75
|
+
- .gitignore
|
76
|
+
- .rspec
|
77
|
+
- Gemfile
|
78
|
+
- README.md
|
79
|
+
- Rakefile
|
80
|
+
- bin/hdp-bin
|
81
|
+
- bin/hdp-bzip
|
82
|
+
- bin/hdp-cat
|
83
|
+
- bin/hdp-catd
|
84
|
+
- bin/hdp-cp
|
85
|
+
- bin/hdp-du
|
86
|
+
- bin/hdp-get
|
87
|
+
- bin/hdp-kill
|
88
|
+
- bin/hdp-kill-task
|
89
|
+
- bin/hdp-ls
|
90
|
+
- bin/hdp-mkdir
|
91
|
+
- bin/hdp-mkdirp
|
92
|
+
- bin/hdp-mv
|
93
|
+
- bin/hdp-parts_to_keys.rb
|
94
|
+
- bin/hdp-ps
|
95
|
+
- bin/hdp-put
|
96
|
+
- bin/hdp-rm
|
97
|
+
- bin/hdp-sort
|
98
|
+
- bin/hdp-stream
|
99
|
+
- bin/hdp-stream-flat
|
100
|
+
- bin/hdp-stream2
|
101
|
+
- bin/hdp-sync
|
102
|
+
- bin/hdp-wc
|
103
|
+
- bin/wu-hadoop
|
104
|
+
- examples/counter.rb
|
105
|
+
- examples/map_only.rb
|
106
|
+
- examples/processors.rb
|
107
|
+
- examples/sonnet_18.txt
|
108
|
+
- examples/tokenizer.rb
|
109
|
+
- examples/word_count.rb
|
110
|
+
- features/step_definitions/wu_hadoop_steps.rb
|
111
|
+
- features/support/env.rb
|
112
|
+
- features/wu_hadoop.feature
|
113
|
+
- lib/wukong-hadoop.rb
|
114
|
+
- lib/wukong-hadoop/configuration.rb
|
115
|
+
- lib/wukong-hadoop/driver.rb
|
116
|
+
- lib/wukong-hadoop/driver/hadoop_invocation.rb
|
117
|
+
- lib/wukong-hadoop/driver/inputs_and_outputs.rb
|
118
|
+
- lib/wukong-hadoop/driver/local_invocation.rb
|
119
|
+
- lib/wukong-hadoop/driver/map_logic.rb
|
120
|
+
- lib/wukong-hadoop/driver/reduce_logic.rb
|
121
|
+
- lib/wukong-hadoop/extensions.rb
|
122
|
+
- lib/wukong-hadoop/hadoop_env_methods.rb
|
123
|
+
- lib/wukong-hadoop/version.rb
|
124
|
+
- spec/spec_helper.rb
|
125
|
+
- spec/support/driver_helper.rb
|
126
|
+
- spec/support/integration_helper.rb
|
127
|
+
- spec/wukong-hadoop/driver_spec.rb
|
128
|
+
- spec/wukong-hadoop/hadoop_env_methods_spec.rb
|
129
|
+
- spec/wukong-hadoop/hadoop_mode_spec.rb
|
130
|
+
- spec/wukong-hadoop/local_mode_spec.rb
|
131
|
+
- spec/wukong-hadoop/wu_hadoop_spec.rb
|
132
|
+
- wukong-hadoop.gemspec
|
133
|
+
homepage: https://github.com/infochimps-labs/wukong-hadoop
|
134
|
+
licenses:
|
135
|
+
- Apache 2.0
|
136
|
+
post_install_message:
|
137
|
+
rdoc_options: []
|
138
|
+
require_paths:
|
139
|
+
- lib
|
140
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
141
|
+
none: false
|
142
|
+
requirements:
|
143
|
+
- - ! '>='
|
144
|
+
- !ruby/object:Gem::Version
|
145
|
+
version: '0'
|
146
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
147
|
+
none: false
|
148
|
+
requirements:
|
149
|
+
- - ! '>='
|
150
|
+
- !ruby/object:Gem::Version
|
151
|
+
version: '0'
|
152
|
+
requirements: []
|
153
|
+
rubyforge_project:
|
154
|
+
rubygems_version: 1.8.23
|
155
|
+
signing_key:
|
156
|
+
specification_version: 3
|
157
|
+
summary: Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use
|
158
|
+
it, yet handles terabyte-scale computation with ease.
|
159
|
+
test_files:
|
160
|
+
- spec/spec_helper.rb
|
161
|
+
- spec/support/driver_helper.rb
|
162
|
+
- spec/support/integration_helper.rb
|
163
|
+
- spec/wukong-hadoop/driver_spec.rb
|
164
|
+
- spec/wukong-hadoop/hadoop_env_methods_spec.rb
|
165
|
+
- spec/wukong-hadoop/hadoop_mode_spec.rb
|
166
|
+
- spec/wukong-hadoop/local_mode_spec.rb
|
167
|
+
- spec/wukong-hadoop/wu_hadoop_spec.rb
|
168
|
+
has_rdoc:
|