hadoop-jruby-connector 0.0.8 → 0.0.9

data/VERSION CHANGED
@@ -1 +1 @@
-0.0.8
+0.0.9
data/lib/hjc/hadoop_streaming.rb CHANGED
@@ -1,6 +1,6 @@
 module Hjc
   class HadoopStreaming
-    attr_accessor :input_path, :output_path, :mapper_path, :reducer_path, :jobconf
+    attr_accessor :input_paths, :output_path, :mapper_path, :reducer_path, :jobconf
     attr_accessor :local, :debug

     def initialize
@@ -19,13 +19,20 @@ module Hjc
       @ret == 0 # success if job returned 0
     end

-    def input=(input)
-      # input param seems to explain exact path on Hadoop streaming..
-      file = Util.to_temp_file('input', input)
-      @input_path = Util.rel_path(file)
-      unless @local # path seems on HDFS
-        sh = FsShell.new
-        sh.put(file.path, Util.rel_path(file))
+    def input_path=(input) # compatibility
+      self.input = input
+    end
+
+    def input=(*inputs)
+      @input_paths = []
+      inputs.each do |input|
+        # input param seems to explain exact path on Hadoop streaming..
+        file = Util.to_temp_file('input', input)
+        @input_paths << Util.rel_path(file)
+        unless @local # path seems on HDFS
+          sh = FsShell.new
+          sh.put(file.path, Util.rel_path(file))
+        end
       end
     end

@@ -45,7 +52,7 @@ module Hjc

     def args
       concated_args = []
-      concated_args.concat ['-input', @input_path] if @input_path
+      concated_args.concat @input_paths.collect{|e| ['-input', e]}.flatten if @input_paths
       concated_args.concat ['-output' ,@output_path] if @output_path
       concated_args.concat ['-mapper', @mapper_path] if @mapper_path
       concated_args.concat ['-reducer', @reducer_path] if @reducer_path
@@ -54,11 +61,11 @@ module Hjc
       concated_args.concat ['-debug'] if @debug

       @jobconf.each do |k, v|
-        concated_args += ['-jobconf', "#{k}=#{v}"]
+        concated_args += ['-jobconf', "#{k}=#{v}"]
       end

       @files.each do |k, v|
-        concated_args.concat ["-file", v.path]
+        concated_args.concat ["-file", v.path]
       end

       puts "args: #{concated_args.join(' ')}" if @debug
metadata CHANGED
@@ -1,12 +1,8 @@
 --- !ruby/object:Gem::Specification
 name: hadoop-jruby-connector
 version: !ruby/object:Gem::Version
-  prerelease: false
-  segments:
-  - 0
-  - 0
-  - 8
-  version: 0.0.8
+  prerelease:
+  version: 0.0.9
 platform: ruby
 authors:
 - Koichi Fujikawa
@@ -14,7 +10,7 @@ autorequire:
 bindir: bin
 cert_chain: []

-date: 2011-08-24 00:00:00 +09:00
+date: 2011-10-27 00:00:00 -07:00
 default_executable:
 dependencies: []

@@ -46,29 +42,23 @@ rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      segments:
-      - 0
       version: "0"
 required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      segments:
-      - 0
       version: "0"
 requirements: []

 rubyforge_project:
-rubygems_version: 1.3.6
+rubygems_version: 1.5.1
 signing_key:
 specification_version: 3
 summary: Hadoop connector by JRuby
-test_files:
-- spec/hjc/fs_shell_spec.rb
-- spec/hjc/hadoop_streaming_spec.rb
-- spec/hjc/jar_job_spec.rb
-- spec/hjc/job_monitor_spec.rb
-- spec/hjc/util_spec.rb
+test_files: []
+
data/spec/hjc/fs_shell_spec.rb DELETED
@@ -1,19 +0,0 @@
-require 'hjc'
-
-module Hjc
-  describe FsShell do
-    it 'put and get file to hdfs, and delete' do
-      localfile = Util.to_temp_file('localfile', 'fs shell test')
-
-      shell = FsShell.new
-
-      shell.put(localfile.path, 'remotefile')
-      shell.get('remotefile', 'tmp/returnedfile')
-
-      File.open('tmp/returnedfile').read.should == 'fs shell test'
-
-      shell.rm('remotefile')
-      File.delete('tmp/returnedfile')
-    end
-  end
-end
data/spec/hjc/hadoop_streaming_spec.rb DELETED
@@ -1,132 +0,0 @@
-require 'hjc'
-require 'fileutils'
-
-module Hjc
-  describe HadoopStreaming do
-    TMP_DIR = 'tmp'
-
-    before :all do
-      @map_script = Util.to_temp_file('map.rb', MAPPER, :mod => 0700)
-      @reduce_script = Util.to_temp_file('reduce.rb', REDUCER, :mod => 0700)
-      @data_file = Util.to_temp_file('testdata', TEST_DATA)
-      @output_dir = TMP_DIR + '/out' + Time.new.to_i.to_s
-    end
-
-    it 'create args for hadoop streaming' do
-      job = HadoopStreaming.new
-      job.input_path = "input"
-      job.output_path = "outdir"
-      job.mapper_path = "mapper"
-      job.reducer_path = "reducer"
-
-      job.args.join(" ").should == "-input input -output outdir -mapper mapper -reducer reducer"
-    end
-
-    it 'create args for hadoop streaming with files' do
-      job = HadoopStreaming.new
-      job.input = 'input'
-      job.mapper = 'mapper'
-      job.reducer = 'reducer'
-
-      job.args.should include('-file')
-      job.args.join(" ").should match(/input/)
-      job.args.join(" ").should match(/mapper/)
-      job.args.join(" ").should match(/reducer/)
-    end
-
-    it 'can run Hadoop streaming job with path' do
-      pending 'path does not work'
-      job = HadoopStreaming.new
-      job.input_path = "file://" + File.expand_path(@data_file.path)
-      job.output_path = @output_dir
-      job.mapper_path = @map_script.path
-      job.reducer_path = @reduce_script.path
-      job.local = true
-
-      job.run
-
-      assert_result
-
-      clean_output
-    end
-
-    it 'can run Hadoop streaming job with string' do
-      pending
-      job = HadoopStreaming.new
-      job.input = TEST_DATA
-      job.output_path = @output_dir
-      job.mapper = MAPPER
-      job.reducer = REDUCER
-      job.local = true
-
-      job.run
-
-      assert_result
-
-      clean_output
-    end
-
-    it 'sets conf params' do
-      job = HadoopStreaming.new
-      job.input_path = "input"
-      job.output_path = "outdir"
-      job.jobconf['hoge'] = "fuga"
-
-      job.args.join(" ").should == %Q!-input input -output outdir -jobconf hoge=fuga!
-    end
-
-    describe '.add_file' do
-      it 'adds -file option' do
-        job = HadoopStreaming.new
-        file = Tempfile.new('additional.txt', 'tmp')
-        job.add_file(file)
-
-        job.args.join(" ").should match("-file")
-        job.args.join(" ").should match("tmp/additional.txt")
-      end
-    end
-
-    def assert_result
-      File.open(File.join(@output_dir, 'part-00000')) do |f|
-        h = {}
-        f.readlines.each do |line|
-          a = line.split /\t/
-          h[a[0]] = a[1].chomp
-        end
-        h.should == {'bar' => '1', 'foo' => '3', 'fuga' => '2', 'hoge' => '4'}
-      end
-    end
-
-    def clean_output
-      FileUtils.rm_r([@output_dir])
-    end
-
-    MAPPER = <<-'EOF'
-      #!/usr/bin/env ruby
-      ARGF.each do |line|
-        line.chomp!
-        line.split.each do |word|
-          puts "#{word}\t1"
-        end
-      end
-    EOF
-
-    REDUCER = <<-'EOF'
-      #!/usr/bin/env ruby
-      count = Hash.new {|h,k| h[k] = 0}
-      ARGF.each do |line|
-        line.chomp!
-        key, value = line.split(/\t/)
-        count[key] += 1
-      end
-      count.each do |k,v|
-        puts "#{k}\t#{v}"
-      end
-    EOF
-
-    TEST_DATA = <<-'EOF'
-      hoge fuga foo hoge foo
-      foo bar hoge hoge fuga
-    EOF
-  end
-end
data/spec/hjc/jar_job_spec.rb DELETED
@@ -1,25 +0,0 @@
-require 'hjc'
-
-module Hjc
-  describe JarJob do
-
-    before :all do
-      @example_jar = Dir.glob(File.join(Util.hadoop_home, 'hadoop-*-examples.jar')).first
-    end
-
-    it 'creates hadoop jar job args' do
-      job = JarJob.new
-      job.jar_args = %w!pi 1 10!
-      job.args.join(" ").should == 'pi 1 10'
-    end
-
-    it 'can run hadoop jar job' do
-      job = JarJob.new
-      job.jar_file = @example_jar
-      job.main_class = 'org.apache.hadoop.examples.ExampleDriver'
-      job.jar_args = %w!pi 1 10!
-      p job.args
-      job.run
-    end
-  end
-end
data/spec/hjc/job_monitor_spec.rb DELETED
@@ -1,55 +0,0 @@
-require 'hjc'
-
-module Hjc
-  describe JobMonitor do
-
-    before :each do
-      @example_jar = Dir.glob(File.join(Util.hadoop_home, 'hadoop-*-examples.jar')).first
-      s = Hjc::FsShell.new
-      s.rmr('PiEstimator_TMP_3_141592654')
-    end
-
-    it 'monitors hadoop job' do
-      pending
-      run_job_async
-      sleep 5
-      monitor = JobMonitor.new
-      jobs = monitor.running_jobs
-      jobs.size.should > 0
-
-      job = jobs.first
-      job_id_str = job.job_id.to_s
-      job_id_str.should match(/^job_/)
-
-      monitor.job_status(job_id_str).class.should == JobStatus
-    end
-
-    it 'kills job' do
-      run_job_async
-      sleep 5
-      monitor = JobMonitor.new
-      job = monitor.running_jobs.first
-      job.should_not be_nil
-
-      job_id_str = job.job_id.to_s
-      monitor.kill_job(job_id_str)
-
-      sleep 60
-      monitor.running_jobs.size.should == 0
-    end
-
-    def run_job_async
-      begin
-        Thread.new do
-          job = JarJob.new
-          job.jar_file = @example_jar
-          job.main_class = 'org.apache.hadoop.examples.ExampleDriver'
-          job.jar_args = %w!pi 1 10!
-          job.run
-        end
-      rescue => e
-        p e
-      end
-    end
-  end
-end
data/spec/hjc/util_spec.rb DELETED
@@ -1,25 +0,0 @@
-require 'hjc'
-
-module Hjc
-  describe 'Hjc::util' do
-    it 'can convert string to file' do
-      f = Util.to_temp_file('map.rb', 'mapscript')
-
-      FileTest.exist?(f.path).should be_true
-      File.open(f.path).read.should == 'mapscript'
-    end
-
-    it 'can convert string to file with exec flag' do
-      f = Util.to_temp_file('map.rb', 'mapscript', :mod => 0700)
-
-      FileTest.executable?(f.path).should be_true
-    end
-
-    it 'returns relative path from tempfile' do
-      f = Util.to_temp_file('map.rb', 'mapscript', :mod => 0700)
-
-      filename = File.basename(f.path)
-      Util.rel_path(f).should == "tmp/#{filename}"
-    end
-  end
-end