hadoop-jruby-connector 0.0.8 → 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/lib/hjc/hadoop_streaming.rb +18 -11
- metadata +8 -18
- data/spec/hjc/fs_shell_spec.rb +0 -19
- data/spec/hjc/hadoop_streaming_spec.rb +0 -132
- data/spec/hjc/jar_job_spec.rb +0 -25
- data/spec/hjc/job_monitor_spec.rb +0 -55
- data/spec/hjc/util_spec.rb +0 -25
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.
|
1
|
+
0.0.9
|
data/lib/hjc/hadoop_streaming.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
module Hjc
|
2
2
|
class HadoopStreaming
|
3
|
-
attr_accessor :
|
3
|
+
attr_accessor :input_paths, :output_path, :mapper_path, :reducer_path, :jobconf
|
4
4
|
attr_accessor :local, :debug
|
5
5
|
|
6
6
|
def initialize
|
@@ -19,13 +19,20 @@ module Hjc
|
|
19
19
|
@ret == 0 # success if job returned 0
|
20
20
|
end
|
21
21
|
|
22
|
-
def
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
22
|
+
def input_path=(input) # compatibility
|
23
|
+
self.input = input
|
24
|
+
end
|
25
|
+
|
26
|
+
def input=(*inputs)
|
27
|
+
@input_paths = []
|
28
|
+
inputs.each do |input|
|
29
|
+
# input param seems to explain exact path on Hadoop streaming..
|
30
|
+
file = Util.to_temp_file('input', input)
|
31
|
+
@input_paths << Util.rel_path(file)
|
32
|
+
unless @local # path seems on HDFS
|
33
|
+
sh = FsShell.new
|
34
|
+
sh.put(file.path, Util.rel_path(file))
|
35
|
+
end
|
29
36
|
end
|
30
37
|
end
|
31
38
|
|
@@ -45,7 +52,7 @@ module Hjc
|
|
45
52
|
|
46
53
|
def args
|
47
54
|
concated_args = []
|
48
|
-
concated_args.concat ['-input',
|
55
|
+
concated_args.concat @input_paths.collect{|e| ['-input', e]}.flatten if @input_paths
|
49
56
|
concated_args.concat ['-output' ,@output_path] if @output_path
|
50
57
|
concated_args.concat ['-mapper', @mapper_path] if @mapper_path
|
51
58
|
concated_args.concat ['-reducer', @reducer_path] if @reducer_path
|
@@ -54,11 +61,11 @@ module Hjc
|
|
54
61
|
concated_args.concat ['-debug'] if @debug
|
55
62
|
|
56
63
|
@jobconf.each do |k, v|
|
57
|
-
|
64
|
+
concated_args += ['-jobconf', "#{k}=#{v}"]
|
58
65
|
end
|
59
66
|
|
60
67
|
@files.each do |k, v|
|
61
|
-
|
68
|
+
concated_args.concat ["-file", v.path]
|
62
69
|
end
|
63
70
|
|
64
71
|
puts "args: #{concated_args.join(' ')}" if @debug
|
metadata
CHANGED
@@ -1,12 +1,8 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hadoop-jruby-connector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
prerelease:
|
5
|
-
|
6
|
-
- 0
|
7
|
-
- 0
|
8
|
-
- 8
|
9
|
-
version: 0.0.8
|
4
|
+
prerelease:
|
5
|
+
version: 0.0.9
|
10
6
|
platform: ruby
|
11
7
|
authors:
|
12
8
|
- Koichi Fujikawa
|
@@ -14,7 +10,7 @@ autorequire:
|
|
14
10
|
bindir: bin
|
15
11
|
cert_chain: []
|
16
12
|
|
17
|
-
date: 2011-
|
13
|
+
date: 2011-10-27 00:00:00 -07:00
|
18
14
|
default_executable:
|
19
15
|
dependencies: []
|
20
16
|
|
@@ -46,29 +42,23 @@ rdoc_options: []
|
|
46
42
|
require_paths:
|
47
43
|
- lib
|
48
44
|
required_ruby_version: !ruby/object:Gem::Requirement
|
45
|
+
none: false
|
49
46
|
requirements:
|
50
47
|
- - ">="
|
51
48
|
- !ruby/object:Gem::Version
|
52
|
-
segments:
|
53
|
-
- 0
|
54
49
|
version: "0"
|
55
50
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
51
|
+
none: false
|
56
52
|
requirements:
|
57
53
|
- - ">="
|
58
54
|
- !ruby/object:Gem::Version
|
59
|
-
segments:
|
60
|
-
- 0
|
61
55
|
version: "0"
|
62
56
|
requirements: []
|
63
57
|
|
64
58
|
rubyforge_project:
|
65
|
-
rubygems_version: 1.
|
59
|
+
rubygems_version: 1.5.1
|
66
60
|
signing_key:
|
67
61
|
specification_version: 3
|
68
62
|
summary: Hadoop connector by JRuby
|
69
|
-
test_files:
|
70
|
-
|
71
|
-
- spec/hjc/hadoop_streaming_spec.rb
|
72
|
-
- spec/hjc/jar_job_spec.rb
|
73
|
-
- spec/hjc/job_monitor_spec.rb
|
74
|
-
- spec/hjc/util_spec.rb
|
63
|
+
test_files: []
|
64
|
+
|
data/spec/hjc/fs_shell_spec.rb
DELETED
@@ -1,19 +0,0 @@
|
|
1
|
-
require 'hjc'
|
2
|
-
|
3
|
-
module Hjc
|
4
|
-
describe FsShell do
|
5
|
-
it 'put and get file to hdfs, and delete' do
|
6
|
-
localfile = Util.to_temp_file('localfile', 'fs shell test')
|
7
|
-
|
8
|
-
shell = FsShell.new
|
9
|
-
|
10
|
-
shell.put(localfile.path, 'remotefile')
|
11
|
-
shell.get('remotefile', 'tmp/returnedfile')
|
12
|
-
|
13
|
-
File.open('tmp/returnedfile').read.should == 'fs shell test'
|
14
|
-
|
15
|
-
shell.rm('remotefile')
|
16
|
-
File.delete('tmp/returnedfile')
|
17
|
-
end
|
18
|
-
end
|
19
|
-
end
|
@@ -1,132 +0,0 @@
|
|
1
|
-
require 'hjc'
|
2
|
-
require 'fileutils'
|
3
|
-
|
4
|
-
module Hjc
|
5
|
-
describe HadoopStreaming do
|
6
|
-
TMP_DIR = 'tmp'
|
7
|
-
|
8
|
-
before :all do
|
9
|
-
@map_script = Util.to_temp_file('map.rb', MAPPER, :mod => 0700)
|
10
|
-
@reduce_script = Util.to_temp_file('reduce.rb', REDUCER, :mod => 0700)
|
11
|
-
@data_file = Util.to_temp_file('testdata', TEST_DATA)
|
12
|
-
@output_dir = TMP_DIR + '/out' + Time.new.to_i.to_s
|
13
|
-
end
|
14
|
-
|
15
|
-
it 'create args for hadoop streaming' do
|
16
|
-
job = HadoopStreaming.new
|
17
|
-
job.input_path = "input"
|
18
|
-
job.output_path = "outdir"
|
19
|
-
job.mapper_path = "mapper"
|
20
|
-
job.reducer_path = "reducer"
|
21
|
-
|
22
|
-
job.args.join(" ").should == "-input input -output outdir -mapper mapper -reducer reducer"
|
23
|
-
end
|
24
|
-
|
25
|
-
it 'create args for hadoop streaming with files' do
|
26
|
-
job = HadoopStreaming.new
|
27
|
-
job.input = 'input'
|
28
|
-
job.mapper = 'mapper'
|
29
|
-
job.reducer = 'reducer'
|
30
|
-
|
31
|
-
job.args.should include('-file')
|
32
|
-
job.args.join(" ").should match(/input/)
|
33
|
-
job.args.join(" ").should match(/mapper/)
|
34
|
-
job.args.join(" ").should match(/reducer/)
|
35
|
-
end
|
36
|
-
|
37
|
-
it 'can run Hadoop streaming job with path' do
|
38
|
-
pending 'path does not work'
|
39
|
-
job = HadoopStreaming.new
|
40
|
-
job.input_path = "file://" + File.expand_path(@data_file.path)
|
41
|
-
job.output_path = @output_dir
|
42
|
-
job.mapper_path = @map_script.path
|
43
|
-
job.reducer_path = @reduce_script.path
|
44
|
-
job.local = true
|
45
|
-
|
46
|
-
job.run
|
47
|
-
|
48
|
-
assert_result
|
49
|
-
|
50
|
-
clean_output
|
51
|
-
end
|
52
|
-
|
53
|
-
it 'can run Hadoop streaming job with string' do
|
54
|
-
pending
|
55
|
-
job = HadoopStreaming.new
|
56
|
-
job.input = TEST_DATA
|
57
|
-
job.output_path = @output_dir
|
58
|
-
job.mapper = MAPPER
|
59
|
-
job.reducer = REDUCER
|
60
|
-
job.local = true
|
61
|
-
|
62
|
-
job.run
|
63
|
-
|
64
|
-
assert_result
|
65
|
-
|
66
|
-
clean_output
|
67
|
-
end
|
68
|
-
|
69
|
-
it 'sets conf params' do
|
70
|
-
job = HadoopStreaming.new
|
71
|
-
job.input_path = "input"
|
72
|
-
job.output_path = "outdir"
|
73
|
-
job.jobconf['hoge'] = "fuga"
|
74
|
-
|
75
|
-
job.args.join(" ").should == %Q!-input input -output outdir -jobconf hoge=fuga!
|
76
|
-
end
|
77
|
-
|
78
|
-
describe '.add_file' do
|
79
|
-
it 'adds -file option' do
|
80
|
-
job = HadoopStreaming.new
|
81
|
-
file = Tempfile.new('additional.txt', 'tmp')
|
82
|
-
job.add_file(file)
|
83
|
-
|
84
|
-
job.args.join(" ").should match("-file")
|
85
|
-
job.args.join(" ").should match("tmp/additional.txt")
|
86
|
-
end
|
87
|
-
end
|
88
|
-
|
89
|
-
def assert_result
|
90
|
-
File.open(File.join(@output_dir, 'part-00000')) do |f|
|
91
|
-
h = {}
|
92
|
-
f.readlines.each do |line|
|
93
|
-
a = line.split /\t/
|
94
|
-
h[a[0]] = a[1].chomp
|
95
|
-
end
|
96
|
-
h.should == {'bar' => '1', 'foo' => '3', 'fuga' => '2', 'hoge' => '4'}
|
97
|
-
end
|
98
|
-
end
|
99
|
-
|
100
|
-
def clean_output
|
101
|
-
FileUtils.rm_r([@output_dir])
|
102
|
-
end
|
103
|
-
|
104
|
-
MAPPER = <<-'EOF'
|
105
|
-
#!/usr/bin/env ruby
|
106
|
-
ARGF.each do |line|
|
107
|
-
line.chomp!
|
108
|
-
line.split.each do |word|
|
109
|
-
puts "#{word}\t1"
|
110
|
-
end
|
111
|
-
end
|
112
|
-
EOF
|
113
|
-
|
114
|
-
REDUCER = <<-'EOF'
|
115
|
-
#!/usr/bin/env ruby
|
116
|
-
count = Hash.new {|h,k| h[k] = 0}
|
117
|
-
ARGF.each do |line|
|
118
|
-
line.chomp!
|
119
|
-
key, value = line.split(/\t/)
|
120
|
-
count[key] += 1
|
121
|
-
end
|
122
|
-
count.each do |k,v|
|
123
|
-
puts "#{k}\t#{v}"
|
124
|
-
end
|
125
|
-
EOF
|
126
|
-
|
127
|
-
TEST_DATA = <<-'EOF'
|
128
|
-
hoge fuga foo hoge foo
|
129
|
-
foo bar hoge hoge fuga
|
130
|
-
EOF
|
131
|
-
end
|
132
|
-
end
|
data/spec/hjc/jar_job_spec.rb
DELETED
@@ -1,25 +0,0 @@
|
|
1
|
-
require 'hjc'
|
2
|
-
|
3
|
-
module Hjc
|
4
|
-
describe JarJob do
|
5
|
-
|
6
|
-
before :all do
|
7
|
-
@example_jar = Dir.glob(File.join(Util.hadoop_home, 'hadoop-*-examples.jar')).first
|
8
|
-
end
|
9
|
-
|
10
|
-
it 'creates hadoop jar job args' do
|
11
|
-
job = JarJob.new
|
12
|
-
job.jar_args = %w!pi 1 10!
|
13
|
-
job.args.join(" ").should == 'pi 1 10'
|
14
|
-
end
|
15
|
-
|
16
|
-
it 'can run hadoop jar job' do
|
17
|
-
job = JarJob.new
|
18
|
-
job.jar_file = @example_jar
|
19
|
-
job.main_class = 'org.apache.hadoop.examples.ExampleDriver'
|
20
|
-
job.jar_args = %w!pi 1 10!
|
21
|
-
p job.args
|
22
|
-
job.run
|
23
|
-
end
|
24
|
-
end
|
25
|
-
end
|
@@ -1,55 +0,0 @@
|
|
1
|
-
require 'hjc'
|
2
|
-
|
3
|
-
module Hjc
|
4
|
-
describe JobMonitor do
|
5
|
-
|
6
|
-
before :each do
|
7
|
-
@example_jar = Dir.glob(File.join(Util.hadoop_home, 'hadoop-*-examples.jar')).first
|
8
|
-
s = Hjc::FsShell.new
|
9
|
-
s.rmr('PiEstimator_TMP_3_141592654')
|
10
|
-
end
|
11
|
-
|
12
|
-
it 'monitors hadoop job' do
|
13
|
-
pending
|
14
|
-
run_job_async
|
15
|
-
sleep 5
|
16
|
-
monitor = JobMonitor.new
|
17
|
-
jobs = monitor.running_jobs
|
18
|
-
jobs.size.should > 0
|
19
|
-
|
20
|
-
job = jobs.first
|
21
|
-
job_id_str = job.job_id.to_s
|
22
|
-
job_id_str.should match(/^job_/)
|
23
|
-
|
24
|
-
monitor.job_status(job_id_str).class.should == JobStatus
|
25
|
-
end
|
26
|
-
|
27
|
-
it 'kills job' do
|
28
|
-
run_job_async
|
29
|
-
sleep 5
|
30
|
-
monitor = JobMonitor.new
|
31
|
-
job = monitor.running_jobs.first
|
32
|
-
job.should_not be_nil
|
33
|
-
|
34
|
-
job_id_str = job.job_id.to_s
|
35
|
-
monitor.kill_job(job_id_str)
|
36
|
-
|
37
|
-
sleep 60
|
38
|
-
monitor.running_jobs.size.should == 0
|
39
|
-
end
|
40
|
-
|
41
|
-
def run_job_async
|
42
|
-
begin
|
43
|
-
Thread.new do
|
44
|
-
job = JarJob.new
|
45
|
-
job.jar_file = @example_jar
|
46
|
-
job.main_class = 'org.apache.hadoop.examples.ExampleDriver'
|
47
|
-
job.jar_args = %w!pi 1 10!
|
48
|
-
job.run
|
49
|
-
end
|
50
|
-
rescue => e
|
51
|
-
p e
|
52
|
-
end
|
53
|
-
end
|
54
|
-
end
|
55
|
-
end
|
data/spec/hjc/util_spec.rb
DELETED
@@ -1,25 +0,0 @@
|
|
1
|
-
require 'hjc'
|
2
|
-
|
3
|
-
module Hjc
|
4
|
-
describe 'Hjc::util' do
|
5
|
-
it 'can convert string to file' do
|
6
|
-
f = Util.to_temp_file('map.rb', 'mapscript')
|
7
|
-
|
8
|
-
FileTest.exist?(f.path).should be_true
|
9
|
-
File.open(f.path).read.should == 'mapscript'
|
10
|
-
end
|
11
|
-
|
12
|
-
it 'can convert string to file with exec flag' do
|
13
|
-
f = Util.to_temp_file('map.rb', 'mapscript', :mod => 0700)
|
14
|
-
|
15
|
-
FileTest.executable?(f.path).should be_true
|
16
|
-
end
|
17
|
-
|
18
|
-
it 'returns relative path from tempfile' do
|
19
|
-
f = Util.to_temp_file('map.rb', 'mapscript', :mod => 0700)
|
20
|
-
|
21
|
-
filename = File.basename(f.path)
|
22
|
-
Util.rel_path(f).should == "tmp/#{filename}"
|
23
|
-
end
|
24
|
-
end
|
25
|
-
end
|