hadoop-jruby-connector 0.0.1.2010122601

data/README ADDED
@@ -0,0 +1 @@
+ Now under development. Please stay tuned.
data/Rakefile ADDED
@@ -0,0 +1,26 @@
+ require 'rspec/core/rake_task'
+
+ # rspec
+ RSpec::Core::RakeTask.new(:spec) do |t|
+   t.rspec_opts = ['-c', '-fs']
+ end
+
+ # jeweler
+ begin
+   require 'jeweler'
+   Jeweler::Tasks.new do |gemspec|
+     gemspec.name = "hadoop-jruby-connector"
+     gemspec.summary = "Hadoop connector by JRuby"
+     gemspec.description = "Hadoop connector by JRuby"
+     gemspec.email = "fujibee@hapyrus.com"
+     gemspec.homepage = "http://github.com/hapyrus/hadoop-jruby-connector"
+     gemspec.authors = ["Koichi Fujikawa"]
+
+     gemspec.files.exclude "spec/**/*"
+   end
+   Jeweler::GemcutterTasks.new
+ rescue LoadError
+   puts "Jeweler not available. Install it with: gem install jeweler"
+ end
+
+ task :default => :spec
data/VERSION ADDED
@@ -0,0 +1 @@
+ 0.0.1.2010122601
data/lib/hjc.rb ADDED
@@ -0,0 +1,15 @@
+ require 'java'
+
+ require 'hjc/util'
+ require 'hjc/hadoop_streaming'
+ require 'hjc/fs_shell'
+
+ home = ENV['HADOOP_HOME']
+ ['', 'lib', 'contrib/streaming'].each do |path|
+   Dir.glob(File.join(home, path, "*.jar")) {|jar| require jar }
+ end
+ $CLASSPATH << "#{home}/conf"
+
+ java_import org.apache.hadoop.streaming.StreamJob
+ java_import org.apache.hadoop.fs.FsShell
+ java_import org.apache.hadoop.conf.Configuration
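Illustrative only, not part of the gem: lib/hjc.rb assumes it is loaded under JRuby with HADOOP_HOME pointing at a Hadoop install that still ships contrib/streaming, since it globs those jars onto the classpath before the java_import calls. A minimal bootstrap sketch, with an assumed install path:

  # Hypothetical bootstrap; the HADOOP_HOME value is an assumption.
  ENV['HADOOP_HOME'] ||= '/usr/local/hadoop-0.20.2'
  require 'hjc'                                # loads the jars and runs the java_imports above

  puts Hjc::HadoopStreaming.new.args.inspect   # => [] until paths or options are set
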
data/lib/hjc/fs_shell.rb ADDED
@@ -0,0 +1,36 @@
+ module Hjc
+   class FsShell
+     def initialize
+       @params = []
+     end
+
+     def put(local, remote)
+       @cmd = :put
+       @params = [local, remote]
+       run
+     end
+
+     def get(remote, local)
+       @cmd = :get
+       @params = [remote, local]
+       run
+     end
+
+     def rm(remote)
+       @cmd = :rm
+       @params = [remote]
+       run
+     end
+
+     def run
+       java_shell = ::FsShell.new(Configuration.new)
+       java_shell.run(build_args)
+     end
+
+     def build_args
+       concated_args = ["-#{@cmd}"]
+       concated_args.concat @params
+       concated_args
+     end
+   end
+ end
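A minimal usage sketch for Hjc::FsShell, assuming the gem is installed and HADOOP_HOME/conf points at a reachable HDFS; the file names are placeholders:

  require 'hjc'

  shell = Hjc::FsShell.new
  shell.put('local.txt', 'remote.txt')   # equivalent of `hadoop fs -put local.txt remote.txt`
  shell.get('remote.txt', 'copy.txt')    # equivalent of `hadoop fs -get remote.txt copy.txt`
  shell.rm('remote.txt')                 # equivalent of `hadoop fs -rm remote.txt`
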
data/lib/hjc/hadoop_streaming.rb ADDED
@@ -0,0 +1,63 @@
+ module Hjc
+   class HadoopStreaming
+     attr_accessor :input_path, :output_path, :mapper_path, :reducer_path
+     attr_accessor :local
+     attr_reader :options
+
+     def initialize
+       @files = {}
+       @options = {}
+       @local = false
+     end
+
+     def run
+       java_job = StreamJob.new
+       @ret = java_job.run(args)
+     end
+
+     def success?
+       @ret == 0 # success if job returned 0
+     end
+
+     def input=(input)
+       # Hadoop streaming expects -input to be an actual path, so write the string to a file
+       file = Util.to_temp_file('input', input)
+       @input_path = file.path
+       unless @local # non-local jobs read input from HDFS, so copy the file there first
+         sh = FsShell.new
+         sh.put(file.path, file.path)
+       end
+     end
+
+     def mapper=(mapper)
+       @files['mapper'] = file = Util.to_temp_file('mapper', mapper, :mod => 0700)
+       @mapper_path = File.basename(file.path)
+     end
+
+     def reducer=(reducer)
+       @files['reducer'] = file = Util.to_temp_file('reducer', reducer, :mod => 0700)
+       @reducer_path = File.basename(file.path)
+     end
+
+     def args
+       concated_args = []
+       concated_args.concat ['-input', @input_path] if @input_path
+       concated_args.concat ['-output', @output_path] if @output_path
+       concated_args.concat ['-mapper', @mapper_path] if @mapper_path
+       concated_args.concat ['-reducer', @reducer_path] if @reducer_path
+       concated_args.concat ['-dfs', 'file:///'] if @local
+       concated_args.concat ['-jt', 'local'] if @local # possibly redundant when -dfs is file:///
+
+       @options.each do |k, v|
+         concated_args.concat ["-#{k}", v]
+       end
+
+       @files.each_value do |file|
+         concated_args.concat ["-file", file.path]
+       end
+
+       puts "args: #{concated_args.join(' ')}"
+       concated_args
+     end
+   end
+ end
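A word-count sketch for Hjc::HadoopStreaming run against the local filesystem (local = true). The mapper_script and reducer_script variables are assumptions: strings holding executable script bodies, such as the MAPPER and REDUCER constants in the spec further down.

  require 'hjc'

  job = Hjc::HadoopStreaming.new
  job.input       = "hoge fuga foo hoge\n"   # raw text; written to a temp file by input=
  job.output_path = 'tmp/wordcount-out'      # hypothetical output directory
  job.mapper      = mapper_script            # assumed: a word-count mapper script as a string
  job.reducer     = reducer_script           # assumed: a matching reducer script as a string
  job.local       = true                     # adds -dfs file:/// and -jt local

  job.run
  puts job.success? ? 'job succeeded' : 'job failed'
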
data/lib/hjc/util.rb ADDED
@@ -0,0 +1,21 @@
+ require 'tempfile'
+
+ module Hjc
+   class Util
+     TMP_DIR = 'tmp'
+
+     def self.to_temp_file(filename, body, options={})
+       file = Tempfile.new(filename, TMP_DIR)
+       file.print body
+       file.close
+
+       unless options.empty?
+         if options.keys.include? :mod
+           File.chmod(options[:mod], file.path)
+         end
+       end
+
+       file
+     end
+   end
+ end
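Util.to_temp_file in a nutshell: it writes a string to a Tempfile under the relative tmp/ directory (which must already exist) and optionally chmods it. A short sketch:

  require 'hjc/util'

  script = Hjc::Util.to_temp_file('map.rb', "#!/usr/bin/env ruby\nputs 'hi'\n", :mod => 0700)
  puts script.path                     # e.g. tmp/map.rb20101226-1234-abcd (Tempfile naming)
  puts File.executable?(script.path)   # true, because of :mod => 0700
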
data/spec/hjc/fs_shell_spec.rb ADDED
@@ -0,0 +1,19 @@
+ require 'hjc'
+
+ module Hjc
+   describe FsShell do
+     it 'puts, gets, and deletes a file on hdfs' do
+       localfile = Util.to_temp_file('localfile', 'fs shell test')
+
+       shell = FsShell.new
+
+       shell.put(localfile.path, 'remotefile')
+       shell.get('remotefile', 'tmp/returnedfile')
+
+       File.open('tmp/returnedfile').read.should == 'fs shell test'
+
+       shell.rm('remotefile')
+       File.delete('tmp/returnedfile')
+     end
+   end
+ end
data/spec/hjc/hadoop_streaming_spec.rb ADDED
@@ -0,0 +1,119 @@
+ require 'hjc'
+ require 'fileutils'
+
+ module Hjc
+   describe HadoopStreaming do
+     TMP_DIR = 'tmp'
+
+     before :all do
+       @map_script = Util.to_temp_file('map.rb', MAPPER, :mod => 0700)
+       @reduce_script = Util.to_temp_file('reduce.rb', REDUCER, :mod => 0700)
+       @data_file = Util.to_temp_file('testdata', TEST_DATA)
+       @output_dir = TMP_DIR + '/out' + Time.new.to_i.to_s
+     end
+
+     it 'creates args for hadoop streaming' do
+       job = HadoopStreaming.new
+       job.input_path = "input"
+       job.output_path = "outdir"
+       job.mapper_path = "mapper"
+       job.reducer_path = "reducer"
+
+       job.args.join(" ").should == "-input input -output outdir -mapper mapper -reducer reducer"
+     end
+
+     it 'creates args for hadoop streaming with options' do
+       job = HadoopStreaming.new
+       job.options["dfs"] = "local"
+       job.options["jobconf"] = "mapred.map.tasks=1"
+
+       job.args.join(" ").should == %!-dfs local -jobconf mapred.map.tasks=1!
+     end
+
+     it 'creates args for hadoop streaming with files' do
+       job = HadoopStreaming.new
+       job.input = 'input'
+       job.mapper = 'mapper'
+       job.reducer = 'reducer'
+
+       job.args.should include('-file')
+       job.args.join(" ").should match(/input/)
+       job.args.join(" ").should match(/mapper/)
+       job.args.join(" ").should match(/reducer/)
+     end
+
+     it 'can run Hadoop streaming job with path' do
+       pending 'path does not work'
+       job = HadoopStreaming.new
+       job.input_path = "file://" + File.expand_path(@data_file.path)
+       job.output_path = @output_dir
+       job.mapper_path = @map_script.path
+       job.reducer_path = @reduce_script.path
+       job.local = true
+
+       job.run
+
+       assert_result
+
+       clean_output
+     end
+
+     it 'can run Hadoop streaming job with string' do
+       job = HadoopStreaming.new
+       job.input = TEST_DATA
+       job.output_path = @output_dir
+       job.mapper = MAPPER
+       job.reducer = REDUCER
+       job.local = true
+
+       job.run
+
+       assert_result
+
+       clean_output
+     end
+
+     def assert_result
+       File.open(File.join(@output_dir, 'part-00000')) do |f|
+         h = {}
+         f.readlines.each do |line|
+           a = line.split(/\t/)
+           h[a[0]] = a[1].chomp
+         end
+         h.should == {'bar' => '1', 'foo' => '3', 'fuga' => '2', 'hoge' => '4'}
+       end
+     end
+
+     def clean_output
+       FileUtils.rm_r([@output_dir])
+     end
+
+     MAPPER = <<-'EOF'
+ #!/usr/bin/env ruby
+ ARGF.each do |line|
+   line.chomp!
+   line.split.each do |word|
+     puts "#{word}\t1"
+   end
+ end
+     EOF
+
+     REDUCER = <<-'EOF'
+ #!/usr/bin/env ruby
+ count = Hash.new {|h,k| h[k] = 0}
+ ARGF.each do |line|
+   line.chomp!
+   key, value = line.split(/\t/)
+   count[key] += value.to_i
+ end
+ count.each do |k,v|
+   puts "#{k}\t#{v}"
+ end
+     EOF
+
+     TEST_DATA = <<-'EOF'
+ hoge fuga foo hoge foo
+ foo bar hoge hoge fuga
+     EOF
+   end
+ end
data/spec/hjc/util_spec.rb ADDED
@@ -0,0 +1,18 @@
+ require 'hjc'
+
+ module Hjc
+   describe 'Hjc::Util' do
+     it 'can convert string to file' do
+       f = Util.to_temp_file('map.rb', 'mapscript')
+
+       FileTest.exist?(f.path).should be_true
+       File.open(f.path).read.should == 'mapscript'
+     end
+
+     it 'can convert string to file with exec flag' do
+       f = Util.to_temp_file('map.rb', 'mapscript', :mod => 0700)
+
+       FileTest.executable?(f.path).should be_true
+     end
+   end
+ end
metadata ADDED
@@ -0,0 +1,71 @@
+ --- !ruby/object:Gem::Specification
+ name: hadoop-jruby-connector
+ version: !ruby/object:Gem::Version
+   prerelease: false
+   segments:
+   - 0
+   - 0
+   - 1
+   - 2010122601
+   version: 0.0.1.2010122601
+ platform: ruby
+ authors:
+ - Koichi Fujikawa
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2010-12-26 00:00:00 +09:00
+ default_executable:
+ dependencies: []
+
+ description: Hadoop connector by JRuby
+ email: fujibee@hapyrus.com
+ executables: []
+
+ extensions: []
+
+ extra_rdoc_files:
+ - README
+ files:
+ - README
+ - Rakefile
+ - VERSION
+ - lib/hjc.rb
+ - lib/hjc/fs_shell.rb
+ - lib/hjc/hadoop_streaming.rb
+ - lib/hjc/util.rb
+ has_rdoc: true
+ homepage: http://github.com/hapyrus/hadoop-jruby-connector
+ licenses: []
+
+ post_install_message:
+ rdoc_options: []
+
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       segments:
+       - 0
+       version: "0"
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       segments:
+       - 0
+       version: "0"
+ requirements: []
+
+ rubyforge_project:
+ rubygems_version: 1.3.6
+ signing_key:
+ specification_version: 3
+ summary: Hadoop connector by JRuby
+ test_files:
+ - spec/hjc/fs_shell_spec.rb
+ - spec/hjc/hadoop_streaming_spec.rb
+ - spec/hjc/util_spec.rb