hadoop-jruby-connector 0.0.1.2010122601

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README ADDED
@@ -0,0 +1 @@
+ Now developing. Please stay tuned.
data/Rakefile ADDED
@@ -0,0 +1,26 @@
+ require 'rspec/core/rake_task'
+
+ # rspec
+ RSpec::Core::RakeTask.new(:spec) do |t|
+   t.rspec_opts = ['-c', '-fs']
+ end
+
+ # jeweler
+ begin
+   require 'jeweler'
+   Jeweler::Tasks.new do |gemspec|
+     gemspec.name = "hadoop-jruby-connector"
+     gemspec.summary = "Hadoop connector by JRuby"
+     gemspec.description = "Hadoop connector by JRuby"
+     gemspec.email = "fujibee@hapyrus.com"
+     gemspec.homepage = "http://github.com/hapyrus/hadoop-jruby-connector"
+     gemspec.authors = ["Koichi Fujikawa"]
+
+     gemspec.files.exclude "spec/**/*"
+   end
+   Jeweler::GemcutterTasks.new
+ rescue LoadError
+   puts "Jeweler not available. Install it with: gem install jeweler"
+ end
+
+ task :default => :spec
data/VERSION ADDED
@@ -0,0 +1 @@
+ 0.0.1.2010122601
data/lib/hjc.rb ADDED
@@ -0,0 +1,15 @@
+ require 'java'
+
+ require 'hjc/util'
+ require 'hjc/hadoop_streaming'
+ require 'hjc/fs_shell'
+
+ home = ENV['HADOOP_HOME']
+ ['', 'lib', 'contrib/streaming'].each do |path|
+   Dir.glob(File.join(home, path, "*.jar")) {|jar| require jar }
+ end
+ $CLASSPATH << "#{home}/conf"
+
+ java_import org.apache.hadoop.streaming.StreamJob
+ java_import org.apache.hadoop.fs.FsShell
+ java_import org.apache.hadoop.conf.Configuration
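For context, requiring the library under JRuby is all the setup the loader above needs: it pulls every jar under HADOOP_HOME (plus lib/ and contrib/streaming/) onto the classpath and imports the Java classes used below. A minimal sketch, assuming JRuby and a local Hadoop installation; the install path is hypothetical:

    # Setup sketch (assumptions: running under JRuby; the path below is illustrative)
    ENV['HADOOP_HOME'] ||= '/usr/lib/hadoop'  # hypothetical install location
    require 'hjc'  # loads the Hadoop jars and imports StreamJob, FsShell, Configuration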
data/lib/hjc/fs_shell.rb ADDED
@@ -0,0 +1,36 @@
+ module Hjc
+   class FsShell
+     def initialize
+       @params = []
+     end
+
+     def put(local, remote)
+       @cmd = :put
+       @params = [local, remote]
+       run
+     end
+
+     def get(remote, local)
+       @cmd = :get
+       @params = [remote, local]
+       run
+     end
+
+     def rm(remote)
+       @cmd = :rm
+       @params = [remote]
+       run
+     end
+
+     def run
+       java_shell = ::FsShell.new(Configuration.new)
+       java_shell.run(build_args)
+     end
+
+     def build_args
+       concated_args = ["-#{@cmd}"]
+       concated_args.concat @params
+       concated_args
+     end
+   end
+ end
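For orientation, a minimal usage sketch of the FsShell wrapper above. It assumes JRuby, HADOOP_HOME pointing at a working installation, and a reachable HDFS; all file names are illustrative, not part of the gem:

    require 'hjc'

    shell = Hjc::FsShell.new
    shell.put('/tmp/words.txt', 'words.txt')       # copy a local file into HDFS
    shell.get('words.txt', '/tmp/words_copy.txt')  # copy it back to the local filesystem
    shell.rm('words.txt')                          # remove the HDFS copy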
data/lib/hjc/hadoop_streaming.rb ADDED
@@ -0,0 +1,63 @@
+ module Hjc
+   class HadoopStreaming
+     attr_accessor :input_path, :output_path, :mapper_path, :reducer_path
+     attr_accessor :local
+     attr_reader :options
+
+     def initialize
+       @files = {}
+       @options = {}
+       @local = false
+     end
+
+     def run
+       java_job = StreamJob.new
+       @ret = java_job.run(args)
+     end
+
+     def success?
+       @ret == 0 # success if job returned 0
+     end
+
+     def input=(input)
+       # the -input param seems to expect an exact path on Hadoop Streaming
+       file = Util.to_temp_file('input', input)
+       @input_path = file.path
+       unless @local # path seems to be on HDFS
+         sh = FsShell.new
+         sh.put(file.path, file.path)
+       end
+     end
+
+     def mapper=(mapper)
+       @files['mapper'] = file = Util.to_temp_file('mapper', mapper, :mod => 0700)
+       @mapper_path = File.basename(file.path)
+     end
+
+     def reducer=(reducer)
+       @files['reducer'] = file = Util.to_temp_file('reducer', reducer, :mod => 0700)
+       @reducer_path = File.basename(file.path)
+     end
+
+     def args
+       concated_args = []
+       concated_args.concat ['-input', @input_path] if @input_path
+       concated_args.concat ['-output', @output_path] if @output_path
+       concated_args.concat ['-mapper', @mapper_path] if @mapper_path
+       concated_args.concat ['-reducer', @reducer_path] if @reducer_path
+       concated_args.concat ['-dfs', 'file:///'] if @local
+       concated_args.concat ['-jt', 'local'] if @local # no use?
+
+       @options.each do |k, v|
+         concated_args.concat ["-#{k}", v]
+       end
+
+       @files.each do |k, v|
+         concated_args.concat ["-file", v.path]
+       end
+
+       puts "args: #{concated_args.join(' ')}"
+       concated_args
+     end
+   end
+ end
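A short usage sketch of the HadoopStreaming wrapper above, following the same pattern as the specs further down. It assumes JRuby, HADOOP_HOME set, and a tmp/ directory in the working directory (Util::TMP_DIR); map.rb and reduce.rb are hypothetical local script files, not part of the gem. Note that local mode is set before input= so the input temp file is not pushed to HDFS:

    require 'hjc'

    job = Hjc::HadoopStreaming.new
    job.local = true                        # run against the local filesystem (-dfs file:///)
    job.input = "hoge fuga foo hoge foo"    # written to a temp file and passed as -input
    job.mapper  = File.read('map.rb')       # mapper script body (map.rb is hypothetical)
    job.reducer = File.read('reduce.rb')    # reducer script body (reduce.rb is hypothetical)
    job.output_path = 'tmp/out'

    job.run
    puts job.success?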
data/lib/hjc/util.rb ADDED
@@ -0,0 +1,21 @@
+ require 'tempfile'
+
+ module Hjc
+   class Util
+     TMP_DIR = 'tmp'
+
+     def self.to_temp_file(filename, body, options={})
+       file = Tempfile.new(filename, TMP_DIR)
+       file.print body
+       file.close
+
+       unless options.empty?
+         if options.keys.include? :mod
+           File.chmod(options[:mod], file.path)
+         end
+       end
+
+       file
+     end
+   end
+ end
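A brief sketch of Util.to_temp_file in isolation: it writes the string into a Tempfile under the relative tmp/ directory (which must already exist) and optionally chmods it. The script body below is illustrative:

    script = Hjc::Util.to_temp_file('map.rb', "#!/usr/bin/env ruby\nputs ARGF.read", :mod => 0700)
    puts script.path                    # e.g. tmp/map.rb20101226-12345-abcdef
    puts File.executable?(script.path)  # => true, because of :mod => 0700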
data/spec/hjc/fs_shell_spec.rb ADDED
@@ -0,0 +1,19 @@
+ require 'hjc'
+
+ module Hjc
+   describe FsShell do
+     it 'put and get file to hdfs, and delete' do
+       localfile = Util.to_temp_file('localfile', 'fs shell test')
+
+       shell = FsShell.new
+
+       shell.put(localfile.path, 'remotefile')
+       shell.get('remotefile', 'tmp/returnedfile')
+
+       File.open('tmp/returnedfile').read.should == 'fs shell test'
+
+       shell.rm('remotefile')
+       File.delete('tmp/returnedfile')
+     end
+   end
+ end
data/spec/hjc/hadoop_streaming_spec.rb ADDED
@@ -0,0 +1,119 @@
+ require 'hjc'
+ require 'fileutils'
+
+ module Hjc
+   describe HadoopStreaming do
+     TMP_DIR = 'tmp'
+
+     before :all do
+       @map_script = Util.to_temp_file('map.rb', MAPPER, :mod => 0700)
+       @reduce_script = Util.to_temp_file('reduce.rb', REDUCER, :mod => 0700)
+       @data_file = Util.to_temp_file('testdata', TEST_DATA)
+       @output_dir = TMP_DIR + '/out' + Time.new.to_i.to_s
+     end
+
+     it 'create args for hadoop streaming' do
+       job = HadoopStreaming.new
+       job.input_path = "input"
+       job.output_path = "outdir"
+       job.mapper_path = "mapper"
+       job.reducer_path = "reducer"
+
+       job.args.join(" ").should == "-input input -output outdir -mapper mapper -reducer reducer"
+     end
+
+     it 'create args for hadoop streaming with options' do
+       job = HadoopStreaming.new
+       job.options["dfs"] = "local"
+       job.options["jobconf"] = "mapred.map.tasks=1"
+
+       job.args.join(" ").should == %!-dfs local -jobconf mapred.map.tasks=1!
+     end
+
+     it 'create args for hadoop streaming with files' do
+       job = HadoopStreaming.new
+       job.input = 'input'
+       job.mapper = 'mapper'
+       job.reducer = 'reducer'
+
+       job.args.should include('-file')
+       job.args.join(" ").should match(/input/)
+       job.args.join(" ").should match(/mapper/)
+       job.args.join(" ").should match(/reducer/)
+     end
+
+     it 'can run Hadoop streaming job with path' do
+       pending 'path does not work'
+       job = HadoopStreaming.new
+       job.input_path = "file://" + File.expand_path(@data_file.path)
+       job.output_path = @output_dir
+       job.mapper_path = @map_script.path
+       job.reducer_path = @reduce_script.path
+       job.local = true
+
+       job.run
+
+       assert_result
+
+       clean_output
+     end
+
+     it 'can run Hadoop streaming job with string' do
+       job = HadoopStreaming.new
+       job.input = TEST_DATA
+       job.output_path = @output_dir
+       job.mapper = MAPPER
+       job.reducer = REDUCER
+       job.local = true
+
+       job.run
+
+       assert_result
+
+       clean_output
+     end
+
+     def assert_result
+       File.open(File.join(@output_dir, 'part-00000')) do |f|
+         h = {}
+         f.readlines.each do |line|
+           a = line.split(/\t/)
+           h[a[0]] = a[1].chomp
+         end
+         h.should == {'bar' => '1', 'foo' => '3', 'fuga' => '2', 'hoge' => '4'}
+       end
+     end
+
+     def clean_output
+       FileUtils.rm_r([@output_dir])
+     end
+
+     MAPPER = <<-'EOF'
+ #!/usr/bin/env ruby
+ ARGF.each do |line|
+   line.chomp!
+   line.split.each do |word|
+     puts "#{word}\t1"
+   end
+ end
+     EOF
+
+     REDUCER = <<-'EOF'
+ #!/usr/bin/env ruby
+ count = Hash.new {|h,k| h[k] = 0}
+ ARGF.each do |line|
+   line.chomp!
+   key, value = line.split(/\t/)
+   count[key] += 1
+ end
+ count.each do |k,v|
+   puts "#{k}\t#{v}"
+ end
+     EOF
+
+     TEST_DATA = <<-'EOF'
+ hoge fuga foo hoge foo
+ foo bar hoge hoge fuga
+     EOF
+   end
+ end
data/spec/hjc/util_spec.rb ADDED
@@ -0,0 +1,18 @@
+ require 'hjc'
+
+ module Hjc
+   describe 'Hjc::Util' do
+     it 'can convert string to file' do
+       f = Util.to_temp_file('map.rb', 'mapscript')
+
+       FileTest.exist?(f.path).should be_true
+       File.open(f.path).read.should == 'mapscript'
+     end
+
+     it 'can convert string to file with exec flag' do
+       f = Util.to_temp_file('map.rb', 'mapscript', :mod => 0700)
+
+       FileTest.executable?(f.path).should be_true
+     end
+   end
+ end
metadata ADDED
@@ -0,0 +1,71 @@
+ --- !ruby/object:Gem::Specification
+ name: hadoop-jruby-connector
+ version: !ruby/object:Gem::Version
+   prerelease: false
+   segments:
+   - 0
+   - 0
+   - 1
+   - 2010122601
+   version: 0.0.1.2010122601
+ platform: ruby
+ authors:
+ - Koichi Fujikawa
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2010-12-26 00:00:00 +09:00
+ default_executable:
+ dependencies: []
+
+ description: Hadoop connector by JRuby
+ email: fujibee@hapyrus.com
+ executables: []
+
+ extensions: []
+
+ extra_rdoc_files:
+ - README
+ files:
+ - README
+ - Rakefile
+ - VERSION
+ - lib/hjc.rb
+ - lib/hjc/fs_shell.rb
+ - lib/hjc/hadoop_streaming.rb
+ - lib/hjc/util.rb
+ has_rdoc: true
+ homepage: http://github.com/hapyrus/hadoop-jruby-connector
+ licenses: []
+
+ post_install_message:
+ rdoc_options: []
+
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       segments:
+       - 0
+       version: "0"
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       segments:
+       - 0
+       version: "0"
+ requirements: []
+
+ rubyforge_project:
+ rubygems_version: 1.3.6
+ signing_key:
+ specification_version: 3
+ summary: Hadoop connector by JRuby
+ test_files:
+ - spec/hjc/fs_shell_spec.rb
+ - spec/hjc/hadoop_streaming_spec.rb
+ - spec/hjc/util_spec.rb