hadoop-jruby-connector 0.0.1.2010122601
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +1 -0
- data/Rakefile +26 -0
- data/VERSION +1 -0
- data/lib/hjc.rb +15 -0
- data/lib/hjc/fs_shell.rb +36 -0
- data/lib/hjc/hadoop_streaming.rb +63 -0
- data/lib/hjc/util.rb +21 -0
- data/spec/hjc/fs_shell_spec.rb +19 -0
- data/spec/hjc/hadoop_streaming_spec.rb +119 -0
- data/spec/hjc/util_spec.rb +18 -0
- metadata +71 -0
data/README
ADDED
@@ -0,0 +1 @@
+now developing.. please stay tuned.
data/Rakefile
ADDED
@@ -0,0 +1,26 @@
+require 'rspec/core/rake_task'
+
+# rspec
+RSpec::Core::RakeTask.new(:spec) do |t|
+  t.rspec_opts = ['-c', '-fs']
+end
+
+# jeweler
+begin
+  require 'jeweler'
+  Jeweler::Tasks.new do |gemspec|
+    gemspec.name = "hadoop-jruby-connector"
+    gemspec.summary = "Hadoop connector by JRuby"
+    gemspec.description = "Hadoop connector by JRuby"
+    gemspec.email = "fujibee@hapyrus.com"
+    gemspec.homepage = "http://github.com/hapyrus/hadoop-jruby-connector"
+    gemspec.authors = ["Koichi Fujikawa"]
+
+    gemspec.files.exclude "spec/**/*"
+  end
+  Jeweler::GemcutterTasks.new
+rescue LoadError
+  puts "Jeweler not available. Install it with: gem install jeweler"
+end
+
+task :default => :spec
data/VERSION
ADDED
@@ -0,0 +1 @@
+0.0.1.2010122601
data/lib/hjc.rb
ADDED
@@ -0,0 +1,15 @@
+require 'java'
+
+require 'hjc/util'
+require 'hjc/hadoop_streaming'
+require 'hjc/fs_shell'
+
+home = ENV['HADOOP_HOME']
+['', 'lib', 'contrib/streaming'].each do |path|
+  Dir.glob(File.join(home, path, "*.jar")) {|jar| require jar }
+end
+$CLASSPATH << "#{home}/conf"
+
+java_import org.apache.hadoop.streaming.StreamJob
+java_import org.apache.hadoop.fs.FsShell
+java_import org.apache.hadoop.conf.Configuration
data/lib/hjc/fs_shell.rb
ADDED
@@ -0,0 +1,36 @@
+module Hjc
+  class FsShell
+    def initialize
+      @params = []
+    end
+
+    def put(local, remote)
+      @cmd = :put
+      @params = [local, remote]
+      run
+    end
+
+    def get(remote, local)
+      @cmd = :get
+      @params = [remote, local]
+      run
+    end
+
+    def rm(remote)
+      @cmd = :rm
+      @params = [remote]
+      run
+    end
+
+    def run
+      java_shell = ::FsShell.new(Configuration.new)
+      java_shell.run(build_args)
+    end
+
+    def build_args
+      concated_args = ["-#{@cmd}"]
+      concated_args.concat @params
+      concated_args
+    end
+  end
+end
data/lib/hjc/hadoop_streaming.rb
ADDED
@@ -0,0 +1,63 @@
+module Hjc
+  class HadoopStreaming
+    attr_accessor :input_path, :output_path, :mapper_path, :reducer_path
+    attr_accessor :local
+    attr_reader :options
+
+    def initialize
+      @files = {}
+      @options = {}
+      @local = false
+    end
+
+    def run
+      java_job = StreamJob.new
+      @ret = java_job.run(args)
+    end
+
+    def success?
+      @ret == 0 # success if job returned 0
+    end
+
+    def input=(input)
+      # input param seems to explain exact path on Hadoop streaming..
+      file = Util.to_temp_file('input', input)
+      @input_path = file.path
+      unless @local # path seems on HDFS
+        sh = FsShell.new
+        sh.put(file.path, file.path)
+      end
+    end
+
+    def mapper=(mapper)
+      @files['mapper'] = file = Util.to_temp_file('mapper', mapper, :mod => 0700)
+      @mapper_path = File.basename(file.path)
+    end
+
+    def reducer=(reducer)
+      @files['reducer'] = file = Util.to_temp_file('reducer', reducer, :mod => 0700)
+      @reducer_path = File.basename(file.path)
+    end
+
+    def args
+      concated_args = []
+      concated_args.concat ['-input', @input_path] if @input_path
+      concated_args.concat ['-output', @output_path] if @output_path
+      concated_args.concat ['-mapper', @mapper_path] if @mapper_path
+      concated_args.concat ['-reducer', @reducer_path] if @reducer_path
+      concated_args.concat ['-dfs', 'file:///'] if @local
+      concated_args.concat ['-jt', 'local'] if @local # no use?
+
+      @options.each do |k, v|
+        concated_args.concat ["-#{k}", v]
+      end
+
+      @files.each do |k, v|
+        concated_args.concat ["-file", v.path]
+      end
+
+      puts "args: #{concated_args.join(' ')}"
+      concated_args
+    end
+  end
+end
data/lib/hjc/util.rb
ADDED
@@ -0,0 +1,21 @@
+require 'tempfile'
+
+module Hjc
+  class Util
+    TMP_DIR = 'tmp'
+
+    def self.to_temp_file(filename, body, options={})
+      file = Tempfile.new(filename, TMP_DIR)
+      file.print body
+      file.close
+
+      unless options.empty?
+        if options.keys.include? :mod
+          File.chmod(options[:mod], file.path)
+        end
+      end
+
+      file
+    end
+  end
+end
data/spec/hjc/fs_shell_spec.rb
ADDED
@@ -0,0 +1,19 @@
+require 'hjc'
+
+module Hjc
+  describe FsShell do
+    it 'put and get file to hdfs, and delete' do
+      localfile = Util.to_temp_file('localfile', 'fs shell test')
+
+      shell = FsShell.new
+
+      shell.put(localfile.path, 'remotefile')
+      shell.get('remotefile', 'tmp/returnedfile')
+
+      File.open('tmp/returnedfile').read.should == 'fs shell test'
+
+      shell.rm('remotefile')
+      File.delete('tmp/returnedfile')
+    end
+  end
+end
data/spec/hjc/hadoop_streaming_spec.rb
ADDED
@@ -0,0 +1,119 @@
+require 'hjc'
+require 'fileutils'
+
+module Hjc
+  describe HadoopStreaming do
+    TMP_DIR = 'tmp'
+
+    before :all do
+      @map_script = Util.to_temp_file('map.rb', MAPPER, :mod => 0700)
+      @reduce_script = Util.to_temp_file('reduce.rb', REDUCER, :mod => 0700)
+      @data_file = Util.to_temp_file('testdata', TEST_DATA)
+      @output_dir = TMP_DIR + '/out' + Time.new.to_i.to_s
+    end
+
+    it 'create args for hadoop streaming' do
+      job = HadoopStreaming.new
+      job.input_path = "input"
+      job.output_path = "outdir"
+      job.mapper_path = "mapper"
+      job.reducer_path = "reducer"
+
+      job.args.join(" ").should == "-input input -output outdir -mapper mapper -reducer reducer"
+    end
+
+    it 'create args for hadoop streaming with options' do
+      job = HadoopStreaming.new
+      job.options["dfs"] = "local"
+      job.options["jobconf"] = "mapred.map.tasks=1"
+
+      job.args.join(" ").should == %!-dfs local -jobconf mapred.map.tasks=1!
+    end
+
+    it 'create args for hadoop streaming with files' do
+      job = HadoopStreaming.new
+      job.input = 'input'
+      job.mapper = 'mapper'
+      job.reducer = 'reducer'
+
+      job.args.should include('-file')
+      job.args.join(" ").should match(/input/)
+      job.args.join(" ").should match(/mapper/)
+      job.args.join(" ").should match(/reducer/)
+    end
+
+    it 'can run Hadoop streaming job with path' do
+      pending 'path does not work'
+      job = HadoopStreaming.new
+      job.input_path = "file://" + File.expand_path(@data_file.path)
+      job.output_path = @output_dir
+      job.mapper_path = @map_script.path
+      job.reducer_path = @reduce_script.path
+      job.local = true
+
+      job.run
+
+      assert_result
+
+      clean_output
+    end
+
+    it 'can run Hadoop streaming job with string' do
+      job = HadoopStreaming.new
+      job.input = TEST_DATA
+      job.output_path = @output_dir
+      job.mapper = MAPPER
+      job.reducer = REDUCER
+      job.local = true
+
+      job.run
+
+      assert_result
+
+      clean_output
+    end
+
+    def assert_result
+      File.open(File.join(@output_dir, 'part-00000')) do |f|
+        h = {}
+        f.readlines.each do |line|
+          a = line.split /\t/
+          h[a[0]] = a[1].chomp
+        end
+        h.should == {'bar' => '1', 'foo' => '3', 'fuga' => '2', 'hoge' => '4'}
+      end
+    end
+
+    def clean_output
+      FileUtils.rm_r([@output_dir])
+    end
+
+    MAPPER = <<-'EOF'
+#!/usr/bin/env ruby
+ARGF.each do |line|
+  line.chomp!
+  line.split.each do |word|
+    puts "#{word}\t1"
+  end
+end
+    EOF
+
+    REDUCER = <<-'EOF'
+#!/usr/bin/env ruby
+count = Hash.new {|h,k| h[k] = 0}
+ARGF.each do |line|
+  line.chomp!
+  key, value = line.split(/\t/)
+  count[key] += 1
+end
+count.each do |k,v|
+  puts "#{k}\t#{v}"
+end
+    EOF
+
+    TEST_DATA = <<-'EOF'
+hoge fuga foo hoge foo
+foo bar hoge hoge fuga
+    EOF
+  end
+end
data/spec/hjc/util_spec.rb
ADDED
@@ -0,0 +1,18 @@
+require 'hjc'
+
+module Hjc
+  describe 'Hjc::util' do
+    it 'can convert string to file' do
+      f = Util.to_temp_file('map.rb', 'mapscript')
+
+      FileTest.exist?(f.path).should be_true
+      File.open(f.path).read.should == 'mapscript'
+    end
+
+    it 'can convert string to file with exec flag' do
+      f = Util.to_temp_file('map.rb', 'mapscript', :mod => 0700)
+
+      FileTest.executable?(f.path).should be_true
+    end
+  end
+end
metadata
ADDED
@@ -0,0 +1,71 @@
+--- !ruby/object:Gem::Specification
+name: hadoop-jruby-connector
+version: !ruby/object:Gem::Version
+  prerelease: false
+  segments:
+  - 0
+  - 0
+  - 1
+  - 2010122601
+  version: 0.0.1.2010122601
+platform: ruby
+authors:
+- Koichi Fujikawa
+autorequire:
+bindir: bin
+cert_chain: []
+
+date: 2010-12-26 00:00:00 +09:00
+default_executable:
+dependencies: []
+
+description: Hadoop connector by JRuby
+email: fujibee@hapyrus.com
+executables: []
+
+extensions: []
+
+extra_rdoc_files:
+- README
+files:
+- README
+- Rakefile
+- VERSION
+- lib/hjc.rb
+- lib/hjc/fs_shell.rb
+- lib/hjc/hadoop_streaming.rb
+- lib/hjc/util.rb
+has_rdoc: true
+homepage: http://github.com/hapyrus/hadoop-jruby-connector
+licenses: []
+
+post_install_message:
+rdoc_options: []
+
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      segments:
+      - 0
+      version: "0"
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      segments:
+      - 0
+      version: "0"
+requirements: []
+
+rubyforge_project:
+rubygems_version: 1.3.6
+signing_key:
+specification_version: 3
+summary: Hadoop connector by JRuby
+test_files:
+- spec/hjc/fs_shell_spec.rb
+- spec/hjc/hadoop_streaming_spec.rb
+- spec/hjc/util_spec.rb