hadoop-jruby-connector 0.0.8 → 0.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/VERSION +1 -1
- data/lib/hjc/hadoop_streaming.rb +18 -11
- metadata +8 -18
- data/spec/hjc/fs_shell_spec.rb +0 -19
- data/spec/hjc/hadoop_streaming_spec.rb +0 -132
- data/spec/hjc/jar_job_spec.rb +0 -25
- data/spec/hjc/job_monitor_spec.rb +0 -55
- data/spec/hjc/util_spec.rb +0 -25
data/VERSION
CHANGED
@@ -1 +1 @@
-0.0.8
+0.0.9
data/lib/hjc/hadoop_streaming.rb
CHANGED
@@ -1,6 +1,6 @@
 module Hjc
   class HadoopStreaming
-    attr_accessor :input_path, :output_path, :mapper_path, :reducer_path, :jobconf
+    attr_accessor :input_paths, :output_path, :mapper_path, :reducer_path, :jobconf
     attr_accessor :local, :debug
 
     def initialize
@@ -19,13 +19,20 @@ module Hjc
       @ret == 0 # success if job returned 0
     end
 
-    def
-
-
-
-
-
-
+    def input_path=(input) # compatibility
+      self.input = input
+    end
+
+    def input=(*inputs)
+      @input_paths = []
+      inputs.each do |input|
+        # input param seems to explain exact path on Hadoop streaming..
+        file = Util.to_temp_file('input', input)
+        @input_paths << Util.rel_path(file)
+        unless @local # path seems on HDFS
+          sh = FsShell.new
+          sh.put(file.path, Util.rel_path(file))
+        end
       end
     end
 
@@ -45,7 +52,7 @@ module Hjc
 
     def args
       concated_args = []
-      concated_args.concat ['-input', @input_path] if @input_path
+      concated_args.concat @input_paths.collect{|e| ['-input', e]}.flatten if @input_paths
       concated_args.concat ['-output' ,@output_path] if @output_path
       concated_args.concat ['-mapper', @mapper_path] if @mapper_path
      concated_args.concat ['-reducer', @reducer_path] if @reducer_path
@@ -54,11 +61,11 @@ module Hjc
       concated_args.concat ['-debug'] if @debug
 
       @jobconf.each do |k, v|
-
+        concated_args += ['-jobconf', "#{k}=#{v}"]
       end
 
       @files.each do |k, v|
-
+        concated_args.concat ["-file", v.path]
       end
 
       puts "args: #{concated_args.join(' ')}" if @debug
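Net effect of the hadoop_streaming.rb changes: input= now accepts one or more strings, writes each to a temp file via Util.to_temp_file, collects the relative paths in input_paths (args then emits one -input pair per path), and copies each file to HDFS with FsShell#put unless local is set; input_path= survives only as a compatibility alias. A minimal usage sketch under those assumptions — the script paths and jobconf key below are illustrative, not taken from the diff:

  require 'hjc'

  job = Hjc::HadoopStreaming.new
  job.local = true                          # skip the FsShell#put copy to HDFS
  job.input = "hoge fuga foo\n"             # written to a temp file; its relative path lands in input_paths
  job.output_path = 'tmp/out'
  job.mapper_path = 'map.rb'                # hypothetical scripts, passed through as -mapper / -reducer
  job.reducer_path = 'reduce.rb'
  job.jobconf['mapred.reduce.tasks'] = '1'  # emitted as -jobconf mapred.reduce.tasks=1

  p job.args  # inspect the generated hadoop-streaming argument list
  job.run     # returns true when the streaming job exits with 0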
metadata
CHANGED
@@ -1,12 +1,8 @@
 --- !ruby/object:Gem::Specification
 name: hadoop-jruby-connector
 version: !ruby/object:Gem::Version
-  prerelease:
-  segments:
-  - 0
-  - 0
-  - 8
-  version: 0.0.8
+  prerelease:
+  version: 0.0.9
 platform: ruby
 authors:
 - Koichi Fujikawa
@@ -14,7 +10,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2011-
+date: 2011-10-27 00:00:00 -07:00
 default_executable:
 dependencies: []
 
@@ -46,29 +42,23 @@ rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      segments:
-      - 0
       version: "0"
 required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      segments:
-      - 0
       version: "0"
 requirements: []
 
 rubyforge_project:
-rubygems_version: 1.
+rubygems_version: 1.5.1
 signing_key:
 specification_version: 3
 summary: Hadoop connector by JRuby
-test_files:
-- spec/hjc/fs_shell_spec.rb
-- spec/hjc/hadoop_streaming_spec.rb
-- spec/hjc/jar_job_spec.rb
-- spec/hjc/job_monitor_spec.rb
-- spec/hjc/util_spec.rb
+test_files: []
+
data/spec/hjc/fs_shell_spec.rb
DELETED
@@ -1,19 +0,0 @@
-require 'hjc'
-
-module Hjc
-  describe FsShell do
-    it 'put and get file to hdfs, and delete' do
-      localfile = Util.to_temp_file('localfile', 'fs shell test')
-
-      shell = FsShell.new
-
-      shell.put(localfile.path, 'remotefile')
-      shell.get('remotefile', 'tmp/returnedfile')
-
-      File.open('tmp/returnedfile').read.should == 'fs shell test'
-
-      shell.rm('remotefile')
-      File.delete('tmp/returnedfile')
-    end
-  end
-end
data/spec/hjc/hadoop_streaming_spec.rb
DELETED
@@ -1,132 +0,0 @@
-require 'hjc'
-require 'fileutils'
-
-module Hjc
-  describe HadoopStreaming do
-    TMP_DIR = 'tmp'
-
-    before :all do
-      @map_script = Util.to_temp_file('map.rb', MAPPER, :mod => 0700)
-      @reduce_script = Util.to_temp_file('reduce.rb', REDUCER, :mod => 0700)
-      @data_file = Util.to_temp_file('testdata', TEST_DATA)
-      @output_dir = TMP_DIR + '/out' + Time.new.to_i.to_s
-    end
-
-    it 'create args for hadoop streaming' do
-      job = HadoopStreaming.new
-      job.input_path = "input"
-      job.output_path = "outdir"
-      job.mapper_path = "mapper"
-      job.reducer_path = "reducer"
-
-      job.args.join(" ").should == "-input input -output outdir -mapper mapper -reducer reducer"
-    end
-
-    it 'create args for hadoop streaming with files' do
-      job = HadoopStreaming.new
-      job.input = 'input'
-      job.mapper = 'mapper'
-      job.reducer = 'reducer'
-
-      job.args.should include('-file')
-      job.args.join(" ").should match(/input/)
-      job.args.join(" ").should match(/mapper/)
-      job.args.join(" ").should match(/reducer/)
-    end
-
-    it 'can run Hadoop streaming job with path' do
-      pending 'path does not work'
-      job = HadoopStreaming.new
-      job.input_path = "file://" + File.expand_path(@data_file.path)
-      job.output_path = @output_dir
-      job.mapper_path = @map_script.path
-      job.reducer_path = @reduce_script.path
-      job.local = true
-
-      job.run
-
-      assert_result
-
-      clean_output
-    end
-
-    it 'can run Hadoop streaming job with string' do
-      pending
-      job = HadoopStreaming.new
-      job.input = TEST_DATA
-      job.output_path = @output_dir
-      job.mapper = MAPPER
-      job.reducer = REDUCER
-      job.local = true
-
-      job.run
-
-      assert_result
-
-      clean_output
-    end
-
-    it 'sets conf params' do
-      job = HadoopStreaming.new
-      job.input_path = "input"
-      job.output_path = "outdir"
-      job.jobconf['hoge'] = "fuga"
-
-      job.args.join(" ").should == %Q!-input input -output outdir -jobconf hoge=fuga!
-    end
-
-    describe '.add_file' do
-      it 'adds -file option' do
-        job = HadoopStreaming.new
-        file = Tempfile.new('additional.txt', 'tmp')
-        job.add_file(file)
-
-        job.args.join(" ").should match("-file")
-        job.args.join(" ").should match("tmp/additional.txt")
-      end
-    end
-
-    def assert_result
-      File.open(File.join(@output_dir, 'part-00000')) do |f|
-        h = {}
-        f.readlines.each do |line|
-          a = line.split /\t/
-          h[a[0]] = a[1].chomp
-        end
-        h.should == {'bar' => '1', 'foo' => '3', 'fuga' => '2', 'hoge' => '4'}
-      end
-    end
-
-    def clean_output
-      FileUtils.rm_r([@output_dir])
-    end
-
-    MAPPER = <<-'EOF'
-#!/usr/bin/env ruby
-ARGF.each do |line|
-  line.chomp!
-  line.split.each do |word|
-    puts "#{word}\t1"
-  end
-end
-EOF
-
-    REDUCER = <<-'EOF'
-#!/usr/bin/env ruby
-count = Hash.new {|h,k| h[k] = 0}
-ARGF.each do |line|
-  line.chomp!
-  key, value = line.split(/\t/)
-  count[key] += 1
-end
-count.each do |k,v|
-  puts "#{k}\t#{v}"
-end
-EOF
-
-    TEST_DATA = <<-'EOF'
-hoge fuga foo hoge foo
-foo bar hoge hoge fuga
-EOF
-  end
-end
data/spec/hjc/jar_job_spec.rb
DELETED
@@ -1,25 +0,0 @@
-require 'hjc'
-
-module Hjc
-  describe JarJob do
-
-    before :all do
-      @example_jar = Dir.glob(File.join(Util.hadoop_home, 'hadoop-*-examples.jar')).first
-    end
-
-    it 'creates hadoop jar job args' do
-      job = JarJob.new
-      job.jar_args = %w!pi 1 10!
-      job.args.join(" ").should == 'pi 1 10'
-    end
-
-    it 'can run hadoop jar job' do
-      job = JarJob.new
-      job.jar_file = @example_jar
-      job.main_class = 'org.apache.hadoop.examples.ExampleDriver'
-      job.jar_args = %w!pi 1 10!
-      p job.args
-      job.run
-    end
-  end
-end
data/spec/hjc/job_monitor_spec.rb
DELETED
@@ -1,55 +0,0 @@
-require 'hjc'
-
-module Hjc
-  describe JobMonitor do
-
-    before :each do
-      @example_jar = Dir.glob(File.join(Util.hadoop_home, 'hadoop-*-examples.jar')).first
-      s = Hjc::FsShell.new
-      s.rmr('PiEstimator_TMP_3_141592654')
-    end
-
-    it 'monitors hadoop job' do
-      pending
-      run_job_async
-      sleep 5
-      monitor = JobMonitor.new
-      jobs = monitor.running_jobs
-      jobs.size.should > 0
-
-      job = jobs.first
-      job_id_str = job.job_id.to_s
-      job_id_str.should match(/^job_/)
-
-      monitor.job_status(job_id_str).class.should == JobStatus
-    end
-
-    it 'kills job' do
-      run_job_async
-      sleep 5
-      monitor = JobMonitor.new
-      job = monitor.running_jobs.first
-      job.should_not be_nil
-
-      job_id_str = job.job_id.to_s
-      monitor.kill_job(job_id_str)
-
-      sleep 60
-      monitor.running_jobs.size.should == 0
-    end
-
-    def run_job_async
-      begin
-        Thread.new do
-          job = JarJob.new
-          job.jar_file = @example_jar
-          job.main_class = 'org.apache.hadoop.examples.ExampleDriver'
-          job.jar_args = %w!pi 1 10!
-          job.run
-        end
-      rescue => e
-        p e
-      end
-    end
-  end
-end
data/spec/hjc/util_spec.rb
DELETED
@@ -1,25 +0,0 @@
-require 'hjc'
-
-module Hjc
-  describe 'Hjc::util' do
-    it 'can convert string to file' do
-      f = Util.to_temp_file('map.rb', 'mapscript')
-
-      FileTest.exist?(f.path).should be_true
-      File.open(f.path).read.should == 'mapscript'
-    end
-
-    it 'can convert string to file with exec flag' do
-      f = Util.to_temp_file('map.rb', 'mapscript', :mod => 0700)
-
-      FileTest.executable?(f.path).should be_true
-    end
-
-    it 'returns relative path from tempfile' do
-      f = Util.to_temp_file('map.rb', 'mapscript', :mod => 0700)
-
-      filename = File.basename(f.path)
-      Util.rel_path(f).should == "tmp/#{filename}"
-    end
-  end
-end