hadoop-jruby-connector 0.0.8 → 0.0.9

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
- 0.0.8
+ 0.0.9
data/lib/hjc/hadoop_streaming.rb CHANGED
@@ -1,6 +1,6 @@
  module Hjc
    class HadoopStreaming
-     attr_accessor :input_path, :output_path, :mapper_path, :reducer_path, :jobconf
+     attr_accessor :input_paths, :output_path, :mapper_path, :reducer_path, :jobconf
      attr_accessor :local, :debug

      def initialize
@@ -19,13 +19,20 @@ module Hjc
        @ret == 0 # success if job returned 0
      end

-     def input=(input)
-       # input param seems to explain exact path on Hadoop streaming..
-       file = Util.to_temp_file('input', input)
-       @input_path = Util.rel_path(file)
-       unless @local # path seems on HDFS
-         sh = FsShell.new
-         sh.put(file.path, Util.rel_path(file))
+     def input_path=(input) # compatibility
+       self.input = input
+     end
+
+     def input=(*inputs)
+       @input_paths = []
+       inputs.each do |input|
+         # input param seems to explain exact path on Hadoop streaming..
+         file = Util.to_temp_file('input', input)
+         @input_paths << Util.rel_path(file)
+         unless @local # path seems on HDFS
+           sh = FsShell.new
+           sh.put(file.path, Util.rel_path(file))
+         end
        end
      end

@@ -45,7 +52,7 @@ module Hjc

      def args
        concated_args = []
-       concated_args.concat ['-input', @input_path] if @input_path
+       concated_args.concat @input_paths.collect{|e| ['-input', e]}.flatten if @input_paths
        concated_args.concat ['-output' ,@output_path] if @output_path
        concated_args.concat ['-mapper', @mapper_path] if @mapper_path
        concated_args.concat ['-reducer', @reducer_path] if @reducer_path
@@ -54,11 +61,11 @@ module Hjc
        concated_args.concat ['-debug'] if @debug

        @jobconf.each do |k, v|
-         concated_args += ['-jobconf', "#{k}=#{v}"]
+         concated_args += ['-jobconf', "#{k}=#{v}"]
        end

        @files.each do |k, v|
-         concated_args.concat ["-file", v.path]
+         concated_args.concat ["-file", v.path]
        end

        puts "args: #{concated_args.join(' ')}" if @debug
metadata CHANGED
@@ -1,12 +1,8 @@
  --- !ruby/object:Gem::Specification
  name: hadoop-jruby-connector
  version: !ruby/object:Gem::Version
-   prerelease: false
-   segments:
-   - 0
-   - 0
-   - 8
-   version: 0.0.8
+   prerelease:
+   version: 0.0.9
  platform: ruby
  authors:
  - Koichi Fujikawa
@@ -14,7 +10,7 @@ autorequire:
  bindir: bin
  cert_chain: []

- date: 2011-08-24 00:00:00 +09:00
+ date: 2011-10-27 00:00:00 -07:00
  default_executable:
  dependencies: []

@@ -46,29 +42,23 @@ rdoc_options: []
  require_paths:
  - lib
  required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
-       segments:
-       - 0
        version: "0"
  required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
-       segments:
-       - 0
        version: "0"
  requirements: []

  rubyforge_project:
- rubygems_version: 1.3.6
+ rubygems_version: 1.5.1
  signing_key:
  specification_version: 3
  summary: Hadoop connector by JRuby
- test_files:
- - spec/hjc/fs_shell_spec.rb
- - spec/hjc/hadoop_streaming_spec.rb
- - spec/hjc/jar_job_spec.rb
- - spec/hjc/job_monitor_spec.rb
- - spec/hjc/util_spec.rb
+ test_files: []
+
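
For context: beyond the version and date bump, the metadata changes match a gemspec regenerated under a newer RubyGems (1.3.6 → 1.5.1): the precomputed segments arrays are dropped, none: false becomes explicit on the version requirements, and the specs are no longer packaged as test_files. A sketch of a gemspec consistent with the 0.0.9 metadata; only fields visible in this diff are shown, and the block layout is an assumption:

    Gem::Specification.new do |s|
      s.name     = 'hadoop-jruby-connector'
      s.version  = '0.0.9'
      s.platform = Gem::Platform::RUBY
      s.authors  = ['Koichi Fujikawa']
      s.summary  = 'Hadoop connector by JRuby'
      s.require_paths = ['lib']
      s.test_files    = []   # specs dropped from the package as of 0.0.9
    end
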
data/spec/hjc/fs_shell_spec.rb DELETED
@@ -1,19 +0,0 @@
- require 'hjc'
-
- module Hjc
-   describe FsShell do
-     it 'put and get file to hdfs, and delete' do
-       localfile = Util.to_temp_file('localfile', 'fs shell test')
-
-       shell = FsShell.new
-
-       shell.put(localfile.path, 'remotefile')
-       shell.get('remotefile', 'tmp/returnedfile')
-
-       File.open('tmp/returnedfile').read.should == 'fs shell test'
-
-       shell.rm('remotefile')
-       File.delete('tmp/returnedfile')
-     end
-   end
- end
data/spec/hjc/hadoop_streaming_spec.rb DELETED
@@ -1,132 +0,0 @@
- require 'hjc'
- require 'fileutils'
-
- module Hjc
-   describe HadoopStreaming do
-     TMP_DIR = 'tmp'
-
-     before :all do
-       @map_script = Util.to_temp_file('map.rb', MAPPER, :mod => 0700)
-       @reduce_script = Util.to_temp_file('reduce.rb', REDUCER, :mod => 0700)
-       @data_file = Util.to_temp_file('testdata', TEST_DATA)
-       @output_dir = TMP_DIR + '/out' + Time.new.to_i.to_s
-     end
-
-     it 'create args for hadoop streaming' do
-       job = HadoopStreaming.new
-       job.input_path = "input"
-       job.output_path = "outdir"
-       job.mapper_path = "mapper"
-       job.reducer_path = "reducer"
-
-       job.args.join(" ").should == "-input input -output outdir -mapper mapper -reducer reducer"
-     end
-
-     it 'create args for hadoop streaming with files' do
-       job = HadoopStreaming.new
-       job.input = 'input'
-       job.mapper = 'mapper'
-       job.reducer = 'reducer'
-
-       job.args.should include('-file')
-       job.args.join(" ").should match(/input/)
-       job.args.join(" ").should match(/mapper/)
-       job.args.join(" ").should match(/reducer/)
-     end
-
-     it 'can run Hadoop streaming job with path' do
-       pending 'path does not work'
-       job = HadoopStreaming.new
-       job.input_path = "file://" + File.expand_path(@data_file.path)
-       job.output_path = @output_dir
-       job.mapper_path = @map_script.path
-       job.reducer_path = @reduce_script.path
-       job.local = true
-
-       job.run
-
-       assert_result
-
-       clean_output
-     end
-
-     it 'can run Hadoop streaming job with string' do
-       pending
-       job = HadoopStreaming.new
-       job.input = TEST_DATA
-       job.output_path = @output_dir
-       job.mapper = MAPPER
-       job.reducer = REDUCER
-       job.local = true
-
-       job.run
-
-       assert_result
-
-       clean_output
-     end
-
-     it 'sets conf params' do
-       job = HadoopStreaming.new
-       job.input_path = "input"
-       job.output_path = "outdir"
-       job.jobconf['hoge'] = "fuga"
-
-       job.args.join(" ").should == %Q!-input input -output outdir -jobconf hoge=fuga!
-     end
-
-     describe '.add_file' do
-       it 'adds -file option' do
-         job = HadoopStreaming.new
-         file = Tempfile.new('additional.txt', 'tmp')
-         job.add_file(file)
-
-         job.args.join(" ").should match("-file")
-         job.args.join(" ").should match("tmp/additional.txt")
-       end
-     end
-
-     def assert_result
-       File.open(File.join(@output_dir, 'part-00000')) do |f|
-         h = {}
-         f.readlines.each do |line|
-           a = line.split /\t/
-           h[a[0]] = a[1].chomp
-         end
-         h.should == {'bar' => '1', 'foo' => '3', 'fuga' => '2', 'hoge' => '4'}
-       end
-     end
-
-     def clean_output
-       FileUtils.rm_r([@output_dir])
-     end
-
-     MAPPER = <<-'EOF'
- #!/usr/bin/env ruby
- ARGF.each do |line|
-   line.chomp!
-   line.split.each do |word|
-     puts "#{word}\t1"
-   end
- end
-     EOF
-
-     REDUCER = <<-'EOF'
- #!/usr/bin/env ruby
- count = Hash.new {|h,k| h[k] = 0}
- ARGF.each do |line|
-   line.chomp!
-   key, value = line.split(/\t/)
-   count[key] += 1
- end
- count.each do |k,v|
-   puts "#{k}\t#{v}"
- end
-     EOF
-
-     TEST_DATA = <<-'EOF'
- hoge fuga foo hoge foo
- foo bar hoge hoge fuga
-     EOF
-   end
- end
data/spec/hjc/jar_job_spec.rb DELETED
@@ -1,25 +0,0 @@
- require 'hjc'
-
- module Hjc
-   describe JarJob do
-
-     before :all do
-       @example_jar = Dir.glob(File.join(Util.hadoop_home, 'hadoop-*-examples.jar')).first
-     end
-
-     it 'creates hadoop jar job args' do
-       job = JarJob.new
-       job.jar_args = %w!pi 1 10!
-       job.args.join(" ").should == 'pi 1 10'
-     end
-
-     it 'can run hadoop jar job' do
-       job = JarJob.new
-       job.jar_file = @example_jar
-       job.main_class = 'org.apache.hadoop.examples.ExampleDriver'
-       job.jar_args = %w!pi 1 10!
-       p job.args
-       job.run
-     end
-   end
- end
data/spec/hjc/job_monitor_spec.rb DELETED
@@ -1,55 +0,0 @@
- require 'hjc'
-
- module Hjc
-   describe JobMonitor do
-
-     before :each do
-       @example_jar = Dir.glob(File.join(Util.hadoop_home, 'hadoop-*-examples.jar')).first
-       s = Hjc::FsShell.new
-       s.rmr('PiEstimator_TMP_3_141592654')
-     end
-
-     it 'monitors hadoop job' do
-       pending
-       run_job_async
-       sleep 5
-       monitor = JobMonitor.new
-       jobs = monitor.running_jobs
-       jobs.size.should > 0
-
-       job = jobs.first
-       job_id_str = job.job_id.to_s
-       job_id_str.should match(/^job_/)
-
-       monitor.job_status(job_id_str).class.should == JobStatus
-     end
-
-     it 'kills job' do
-       run_job_async
-       sleep 5
-       monitor = JobMonitor.new
-       job = monitor.running_jobs.first
-       job.should_not be_nil
-
-       job_id_str = job.job_id.to_s
-       monitor.kill_job(job_id_str)
-
-       sleep 60
-       monitor.running_jobs.size.should == 0
-     end
-
-     def run_job_async
-       begin
-         Thread.new do
-           job = JarJob.new
-           job.jar_file = @example_jar
-           job.main_class = 'org.apache.hadoop.examples.ExampleDriver'
-           job.jar_args = %w!pi 1 10!
-           job.run
-         end
-       rescue => e
-         p e
-       end
-     end
-   end
- end
data/spec/hjc/util_spec.rb DELETED
@@ -1,25 +0,0 @@
- require 'hjc'
-
- module Hjc
-   describe 'Hjc::util' do
-     it 'can convert string to file' do
-       f = Util.to_temp_file('map.rb', 'mapscript')
-
-       FileTest.exist?(f.path).should be_true
-       File.open(f.path).read.should == 'mapscript'
-     end
-
-     it 'can convert string to file with exec flag' do
-       f = Util.to_temp_file('map.rb', 'mapscript', :mod => 0700)
-
-       FileTest.executable?(f.path).should be_true
-     end
-
-     it 'returns relative path from tempfile' do
-       f = Util.to_temp_file('map.rb', 'mapscript', :mod => 0700)
-
-       filename = File.basename(f.path)
-       Util.rel_path(f).should == "tmp/#{filename}"
-     end
-   end
- end