hadoop-jruby-connector 0.0.1.2010122601
- data/README +1 -0
- data/Rakefile +26 -0
- data/VERSION +1 -0
- data/lib/hjc.rb +15 -0
- data/lib/hjc/fs_shell.rb +36 -0
- data/lib/hjc/hadoop_streaming.rb +63 -0
- data/lib/hjc/util.rb +21 -0
- data/spec/hjc/fs_shell_spec.rb +19 -0
- data/spec/hjc/hadoop_streaming_spec.rb +119 -0
- data/spec/hjc/util_spec.rb +18 -0
- metadata +71 -0
data/README
ADDED
@@ -0,0 +1 @@
+now developing.. please stay tuned.
data/Rakefile
ADDED
@@ -0,0 +1,26 @@
+require 'rspec/core/rake_task'
+
+# rspec
+RSpec::Core::RakeTask.new(:spec) do |t|
+  t.rspec_opts = ['-c', '-fs']
+end
+
+# jeweler
+begin
+  require 'jeweler'
+  Jeweler::Tasks.new do |gemspec|
+    gemspec.name = "hadoop-jruby-connector"
+    gemspec.summary = "Hadoop connector by JRuby"
+    gemspec.description = "Hadoop connector by JRuby"
+    gemspec.email = "fujibee@hapyrus.com"
+    gemspec.homepage = "http://github.com/hapyrus/hadoop-jruby-connector"
+    gemspec.authors = ["Koichi Fujikawa"]
+
+    gemspec.files.exclude "spec/**/*"
+  end
+  Jeweler::GemcutterTasks.new
+rescue LoadError
+  puts "Jeweler not available. Install it with: gem install jeweler"
+end
+
+task :default => :spec
data/VERSION
ADDED
@@ -0,0 +1 @@
+0.0.1.2010122601
data/lib/hjc.rb
ADDED
@@ -0,0 +1,15 @@
+require 'java'
+
+require 'hjc/util'
+require 'hjc/hadoop_streaming'
+require 'hjc/fs_shell'
+
+home = ENV['HADOOP_HOME']
+['', 'lib', 'contrib/streaming'].each do |path|
+  Dir.glob(File.join(home, path, "*.jar")) {|jar| require jar }
+end
+$CLASSPATH << "#{home}/conf"
+
+java_import org.apache.hadoop.streaming.StreamJob
+java_import org.apache.hadoop.fs.FsShell
+java_import org.apache.hadoop.conf.Configuration
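Note (not part of the gem): lib/hjc.rb requires every jar under $HADOOP_HOME, $HADOOP_HOME/lib and $HADOOP_HOME/contrib/streaming and appends $HADOOP_HOME/conf to the classpath, so the library only loads under JRuby with HADOOP_HOME pointing at a Hadoop installation. A minimal loading sketch; the install path is hypothetical:

    #!/usr/bin/env jruby
    # Assumes JRuby and a Hadoop layout with jars in lib/ and contrib/streaming/.
    ENV['HADOOP_HOME'] ||= '/usr/lib/hadoop'   # hypothetical path
    require 'hjc'                              # requires the jars and runs the java_imports above

    conf = Configuration.new                   # org.apache.hadoop.conf.Configuration
    puts conf.get('fs.default.name')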
data/lib/hjc/fs_shell.rb
ADDED
@@ -0,0 +1,36 @@
+module Hjc
+  class FsShell
+    def initialize
+      @params = []
+    end
+
+    def put(local, remote)
+      @cmd = :put
+      @params = [local, remote]
+      run
+    end
+
+    def get(remote, local)
+      @cmd = :get
+      @params = [remote, local]
+      run
+    end
+
+    def rm(remote)
+      @cmd = :rm
+      @params = [remote]
+      run
+    end
+
+    def run
+      java_shell = ::FsShell.new(Configuration.new)
+      java_shell.run(build_args)
+    end
+
+    def build_args
+      concated_args = ["-#{@cmd}"]
+      concated_args.concat @params
+      concated_args
+    end
+  end
+end
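Note (not part of the gem): FsShell#build_args just prepends the chosen command as a flag, so each Ruby call maps directly onto the argv that Hadoop's own FsShell expects ("hadoop fs -put LOCAL REMOTE" and so on). A short usage sketch, assuming hjc is loaded under JRuby with HADOOP_HOME set; the file names are hypothetical:

    require 'hjc'

    shell = Hjc::FsShell.new
    shell.put('local.txt', 'remote.txt')   # runs FsShell with ["-put", "local.txt", "remote.txt"]
    shell.get('remote.txt', 'copy.txt')    # ["-get", "remote.txt", "copy.txt"]
    shell.rm('remote.txt')                 # ["-rm", "remote.txt"]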
data/lib/hjc/hadoop_streaming.rb
ADDED
@@ -0,0 +1,63 @@
+module Hjc
+  class HadoopStreaming
+    attr_accessor :input_path, :output_path, :mapper_path, :reducer_path
+    attr_accessor :local
+    attr_reader :options
+
+    def initialize
+      @files = {}
+      @options = {}
+      @local = false
+    end
+
+    def run
+      java_job = StreamJob.new
+      @ret = java_job.run(args)
+    end
+
+    def success?
+      @ret == 0 # success if job returned 0
+    end
+
+    def input=(input)
+      # input param seems to explain exact path on Hadoop streaming..
+      file = Util.to_temp_file('input', input)
+      @input_path = file.path
+      unless @local # path seems on HDFS
+        sh = FsShell.new
+        sh.put(file.path, file.path)
+      end
+    end
+
+    def mapper=(mapper)
+      @files['mapper'] = file = Util.to_temp_file('mapper', mapper, :mod => 0700)
+      @mapper_path = File.basename(file.path)
+    end
+
+    def reducer=(reducer)
+      @files['reducer'] = file = Util.to_temp_file('reducer', reducer, :mod => 0700)
+      @reducer_path = File.basename(file.path)
+    end
+
+    def args
+      concated_args = []
+      concated_args.concat ['-input', @input_path] if @input_path
+      concated_args.concat ['-output', @output_path] if @output_path
+      concated_args.concat ['-mapper', @mapper_path] if @mapper_path
+      concated_args.concat ['-reducer', @reducer_path] if @reducer_path
+      concated_args.concat ['-dfs', 'file:///'] if @local
+      concated_args.concat ['-jt', 'local'] if @local # no use?
+
+      @options.each do |k, v|
+        concated_args.concat ["-#{k}", v]
+      end
+
+      @files.each do |k, v|
+        concated_args.concat ["-file", v.path]
+      end
+
+      puts "args: #{concated_args.join(' ')}"
+      concated_args
+    end
+  end
+end
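Note (not part of the gem): the input=/mapper=/reducer= setters write their string arguments to temp files via Util.to_temp_file, and #args assembles the -input/-output/-mapper/-reducer/-file argument list handed to StreamJob. Because input= checks @local to decide whether to copy the temp file onto HDFS, local has to be set before input. A usage sketch, assuming a tmp/ directory exists in the working directory (Util writes its temp files there); the scripts here are hypothetical identity mapper/reducer strings:

    require 'hjc'

    job = Hjc::HadoopStreaming.new
    job.local       = true                  # adds -dfs file:/// and -jt local; set before input=
    job.input       = "hoge fuga foo\n"     # written to a temp file under tmp/
    job.mapper      = "#!/usr/bin/env ruby\nARGF.each {|line| puts line }"
    job.reducer     = "#!/usr/bin/env ruby\nARGF.each {|line| puts line }"
    job.output_path = 'tmp/out'
    job.options['jobconf'] = 'mapred.map.tasks=1'

    job.run                                 # StreamJob#run with the args built above
    puts job.success?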
data/lib/hjc/util.rb
ADDED
@@ -0,0 +1,21 @@
+require 'tempfile'
+
+module Hjc
+  class Util
+    TMP_DIR = 'tmp'
+
+    def self.to_temp_file(filename, body, options={})
+      file = Tempfile.new(filename, TMP_DIR)
+      file.print body
+      file.close
+
+      unless options.empty?
+        if options.keys.include? :mod
+          File.chmod(options[:mod], file.path)
+        end
+      end
+
+      file
+    end
+  end
+end
data/spec/hjc/fs_shell_spec.rb
ADDED
@@ -0,0 +1,19 @@
+require 'hjc'
+
+module Hjc
+  describe FsShell do
+    it 'put and get file to hdfs, and delete' do
+      localfile = Util.to_temp_file('localfile', 'fs shell test')
+
+      shell = FsShell.new
+
+      shell.put(localfile.path, 'remotefile')
+      shell.get('remotefile', 'tmp/returnedfile')
+
+      File.open('tmp/returnedfile').read.should == 'fs shell test'
+
+      shell.rm('remotefile')
+      File.delete('tmp/returnedfile')
+    end
+  end
+end
data/spec/hjc/hadoop_streaming_spec.rb
ADDED
@@ -0,0 +1,119 @@
+require 'hjc'
+require 'fileutils'
+
+module Hjc
+  describe HadoopStreaming do
+    TMP_DIR = 'tmp'
+
+    before :all do
+      @map_script = Util.to_temp_file('map.rb', MAPPER, :mod => 0700)
+      @reduce_script = Util.to_temp_file('reduce.rb', REDUCER, :mod => 0700)
+      @data_file = Util.to_temp_file('testdata', TEST_DATA)
+      @output_dir = TMP_DIR + '/out' + Time.new.to_i.to_s
+    end
+
+    it 'create args for hadoop streaming' do
+      job = HadoopStreaming.new
+      job.input_path = "input"
+      job.output_path = "outdir"
+      job.mapper_path = "mapper"
+      job.reducer_path = "reducer"
+
+      job.args.join(" ").should == "-input input -output outdir -mapper mapper -reducer reducer"
+    end
+
+    it 'create args for hadoop streaming with options' do
+      job = HadoopStreaming.new
+      job.options["dfs"] = "local"
+      job.options["jobconf"] = "mapred.map.tasks=1"
+
+      job.args.join(" ").should == %!-dfs local -jobconf mapred.map.tasks=1!
+    end
+
+    it 'create args for hadoop streaming with files' do
+      job = HadoopStreaming.new
+      job.input = 'input'
+      job.mapper = 'mapper'
+      job.reducer = 'reducer'
+
+      job.args.should include('-file')
+      job.args.join(" ").should match(/input/)
+      job.args.join(" ").should match(/mapper/)
+      job.args.join(" ").should match(/reducer/)
+    end
+
+    it 'can run Hadoop streaming job with path' do
+      pending 'path does not work'
+      job = HadoopStreaming.new
+      job.input_path = "file://" + File.expand_path(@data_file.path)
+      job.output_path = @output_dir
+      job.mapper_path = @map_script.path
+      job.reducer_path = @reduce_script.path
+      job.local = true
+
+      job.run
+
+      assert_result
+
+      clean_output
+    end
+
+    it 'can run Hadoop streaming job with string' do
+      job = HadoopStreaming.new
+      job.input = TEST_DATA
+      job.output_path = @output_dir
+      job.mapper = MAPPER
+      job.reducer = REDUCER
+      job.local = true
+
+      job.run
+
+      assert_result
+
+      clean_output
+    end
+
+    def assert_result
+      File.open(File.join(@output_dir, 'part-00000')) do |f|
+        h = {}
+        f.readlines.each do |line|
+          a = line.split /\t/
+          h[a[0]] = a[1].chomp
+        end
+        h.should == {'bar' => '1', 'foo' => '3', 'fuga' => '2', 'hoge' => '4'}
+      end
+    end
+
+    def clean_output
+      FileUtils.rm_r([@output_dir])
+    end
+
+    MAPPER = <<-'EOF'
+#!/usr/bin/env ruby
+ARGF.each do |line|
+  line.chomp!
+  line.split.each do |word|
+    puts "#{word}\t1"
+  end
+end
+    EOF
+
+    REDUCER = <<-'EOF'
+#!/usr/bin/env ruby
+count = Hash.new {|h,k| h[k] = 0}
+ARGF.each do |line|
+  line.chomp!
+  key, value = line.split(/\t/)
+  count[key] += 1
+end
+count.each do |k,v|
+  puts "#{k}\t#{v}"
+end
+    EOF
+
+    TEST_DATA = <<-'EOF'
+hoge fuga foo hoge foo
+foo bar hoge hoge fuga
+    EOF
+  end
+end
data/spec/hjc/util_spec.rb
ADDED
@@ -0,0 +1,18 @@
+require 'hjc'
+
+module Hjc
+  describe 'Hjc::util' do
+    it 'can convert string to file' do
+      f = Util.to_temp_file('map.rb', 'mapscript')
+
+      FileTest.exist?(f.path).should be_true
+      File.open(f.path).read.should == 'mapscript'
+    end
+
+    it 'can convert string to file with exec flag' do
+      f = Util.to_temp_file('map.rb', 'mapscript', :mod => 0700)
+
+      FileTest.executable?(f.path).should be_true
+    end
+  end
+end
metadata
ADDED
@@ -0,0 +1,71 @@
+--- !ruby/object:Gem::Specification
+name: hadoop-jruby-connector
+version: !ruby/object:Gem::Version
+  prerelease: false
+  segments:
+  - 0
+  - 0
+  - 1
+  - 2010122601
+  version: 0.0.1.2010122601
+platform: ruby
+authors:
+- Koichi Fujikawa
+autorequire:
+bindir: bin
+cert_chain: []
+
+date: 2010-12-26 00:00:00 +09:00
+default_executable:
+dependencies: []
+
+description: Hadoop connector by JRuby
+email: fujibee@hapyrus.com
+executables: []
+
+extensions: []
+
+extra_rdoc_files:
+- README
+files:
+- README
+- Rakefile
+- VERSION
+- lib/hjc.rb
+- lib/hjc/fs_shell.rb
+- lib/hjc/hadoop_streaming.rb
+- lib/hjc/util.rb
+has_rdoc: true
+homepage: http://github.com/hapyrus/hadoop-jruby-connector
+licenses: []
+
+post_install_message:
+rdoc_options: []
+
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      segments:
+      - 0
+      version: "0"
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      segments:
+      - 0
+      version: "0"
+requirements: []
+
+rubyforge_project:
+rubygems_version: 1.3.6
+signing_key:
+specification_version: 3
+summary: Hadoop connector by JRuby
+test_files:
+- spec/hjc/fs_shell_spec.rb
+- spec/hjc/hadoop_streaming_spec.rb
+- spec/hjc/util_spec.rb