hadoop-rubydsl 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +58 -0
- data/VERSION +1 -1
- data/bin/hrd +5 -0
- data/hadoop-rubydsl.gemspec +11 -9
- data/lib/client.rb +27 -0
- data/lib/{init.rb → dsl_init.rb} +4 -22
- data/lib/hadoop-dsl.rb +12 -0
- data/lib/util.rb +12 -1
- data/spec/client_spec.rb +26 -0
- data/spec/core_spec.rb +1 -1
- data/spec/{init_spec.rb → dsl_init_spec.rb} +6 -13
- data/spec/hive_like_spec.rb +0 -1
- data/spec/log_analysis_spec.rb +0 -1
- data/spec/util_spec.rb +4 -0
- data/spec/word_count_spec.rb +0 -1
- metadata +11 -8
- data/README +0 -53
- data/lib/java/.gitignore +0 -1
- data/lib/java/hadoop-ruby.jar +0 -0
data/README.rdoc
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
= hadoop-rubydsl
|
2
|
+
|
3
|
+
Enables you to run Ruby DSL scripts on Hadoop.
|
4
|
+
|
5
|
+
== Description
|
6
|
+
|
7
|
+
You can write Hadoop Mappers / Reducers in a Ruby DSL.
|
8
|
+
This gem depends on the 'jruby-on-hadoop' project.
|
9
|
+
|
10
|
+
== Install
|
11
|
+
|
12
|
+
All required gems are available on GemCutter.
|
13
|
+
|
14
|
+
1. Upgrade RubyGems to 1.3.5
|
15
|
+
2. Install gems
|
16
|
+
$ gem install hadoop-rubydsl
|
17
|
+
|
18
|
+
== Usage
|
19
|
+
|
20
|
+
1. Run a Hadoop cluster on your machines and set the HADOOP_HOME environment variable.
|
21
|
+
2. Put files into your HDFS, e.g. wc/inputs/file1
|
22
|
+
3. Now you can run 'hrd' like below:
|
23
|
+
$ hrd examples/word_count_test.rb
|
24
|
+
You can find the Hadoop job results in your HDFS at wc/outputs/part-*
|
25
|
+
|
26
|
+
== Examples
|
27
|
+
|
28
|
+
Word Count DSL script
|
29
|
+
use 'WordCount'
|
30
|
+
|
31
|
+
from 'wc/inputs'
|
32
|
+
to 'wc/outputs'
|
33
|
+
|
34
|
+
count_uniq
|
35
|
+
total :bytes, :words, :lines
|
36
|
+
|
37
|
+
Log Analysis DSL script
|
38
|
+
use 'LogAnalysis'
|
39
|
+
|
40
|
+
data 'apache log on test2' do
|
41
|
+
from 'apachelog/inputs'
|
42
|
+
to 'apachelog/outputs'
|
43
|
+
|
44
|
+
each_line do
|
45
|
+
pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*) (.*) "(.*)"/
|
46
|
+
column_name 'remote_host', 'pass', 'user', 'access_date', 'request', 'status', 'bytes', 'pass', 'ua'
|
47
|
+
|
48
|
+
topic 'ua counts', :label => 'ua' do
|
49
|
+
count_uniq column[:ua]
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
== Author
|
55
|
+
Koichi Fujikawa <fujibee@gmail.com>
|
56
|
+
|
57
|
+
== Copyright
|
58
|
+
License: Apache License
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.2
|
1
|
+
0.0.3
|
data/bin/hrd
ADDED
data/hadoop-rubydsl.gemspec
CHANGED
@@ -5,26 +5,27 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{hadoop-rubydsl}
|
8
|
-
s.version = "0.0.2"
|
8
|
+
s.version = "0.0.3"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Koichi Fujikawa"]
|
12
|
-
s.date = %q{
|
12
|
+
s.date = %q{2010-01-04}
|
13
13
|
s.description = %q{Hadoop Ruby DSL}
|
14
14
|
s.email = %q{fujibee@gmail.com}
|
15
|
-
s.executables = ["hadoop-hudson.sh", "hadoop-ruby.sh"]
|
15
|
+
s.executables = ["hrd", "hadoop-hudson.sh", "hadoop-ruby.sh"]
|
16
16
|
s.extra_rdoc_files = [
|
17
|
-
"README",
|
17
|
+
"README.rdoc",
|
18
18
|
"TODO"
|
19
19
|
]
|
20
20
|
s.files = [
|
21
21
|
".gitignore",
|
22
|
-
"README",
|
22
|
+
"README.rdoc",
|
23
23
|
"Rakefile",
|
24
24
|
"TODO",
|
25
25
|
"VERSION",
|
26
26
|
"bin/hadoop-hudson.sh",
|
27
27
|
"bin/hadoop-ruby.sh",
|
28
|
+
"bin/hrd",
|
28
29
|
"conf/hadoop-site.xml",
|
29
30
|
"examples/apachelog-v2-2.rb",
|
30
31
|
"examples/apachelog-v2.rb",
|
@@ -32,11 +33,11 @@ Gem::Specification.new do |s|
|
|
32
33
|
"examples/hive_like_test.rb",
|
33
34
|
"examples/word_count_test.rb",
|
34
35
|
"hadoop-rubydsl.gemspec",
|
36
|
+
"lib/client.rb",
|
35
37
|
"lib/core.rb",
|
38
|
+
"lib/dsl_init.rb",
|
39
|
+
"lib/hadoop-dsl.rb",
|
36
40
|
"lib/hive_like.rb",
|
37
|
-
"lib/init.rb",
|
38
|
-
"lib/java/.gitignore",
|
39
|
-
"lib/java/hadoop-ruby.jar",
|
40
41
|
"lib/log_analysis.rb",
|
41
42
|
"lib/mapred_factory.rb",
|
42
43
|
"lib/util.rb",
|
@@ -49,14 +50,15 @@ Gem::Specification.new do |s|
|
|
49
50
|
s.summary = %q{Hadoop Ruby DSL}
|
50
51
|
s.test_files = [
|
51
52
|
"spec/spec_helper.rb",
|
53
|
+
"spec/dsl_init_spec.rb",
|
52
54
|
"spec/core_spec.rb",
|
55
|
+
"spec/client_spec.rb",
|
53
56
|
"spec/util_spec.rb",
|
54
57
|
"spec/mapred_factory_spec.rb",
|
55
58
|
"spec/word_count_spec.rb",
|
56
59
|
"spec/hive_like_spec.rb",
|
57
60
|
"spec/log_analysis_spec.rb",
|
58
61
|
"spec/example_spec.rb",
|
59
|
-
"spec/init_spec.rb",
|
60
62
|
"examples/apachelog-v2.rb",
|
61
63
|
"examples/hive_like_test.rb",
|
62
64
|
"examples/word_count_test.rb",
|
data/lib/client.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
module HadoopDsl
|
2
|
+
class Client < JRubyOnHadoop::Client
|
3
|
+
def parse_args
|
4
|
+
super
|
5
|
+
@script_path = HadoopDsl.dsl_init_script
|
6
|
+
@script = File.basename(@script_path)
|
7
|
+
@dsl_file_path = @args[0]
|
8
|
+
@dsl_file = File.basename(@dsl_file_path)
|
9
|
+
@files << @script_path << @dsl_file_path
|
10
|
+
|
11
|
+
# TODO move properly, with jruby-on-hadoop
|
12
|
+
add_dsl_lib_files
|
13
|
+
ENV['RUBYLIB'] = File.dirname(@dsl_file_path)
|
14
|
+
end
|
15
|
+
|
16
|
+
def mapred_args
|
17
|
+
args = super
|
18
|
+
args += " --dslfile #{@dsl_file}"
|
19
|
+
args
|
20
|
+
end
|
21
|
+
|
22
|
+
def add_dsl_lib_files
|
23
|
+
lib_path = HadoopDsl.lib_path
|
24
|
+
@files += Dir.glob(File.join(lib_path, "*.rb"))
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
data/lib/{init.rb → dsl_init.rb}
RENAMED
@@ -12,15 +12,14 @@ HadoopDsl::Text = Text
|
|
12
12
|
HadoopDsl::IntWritable = IntWritable
|
13
13
|
|
14
14
|
def map(key, value, output, reporter, script)
|
15
|
-
mapper = MapperFactory.create(script, key
|
15
|
+
mapper = MapperFactory.create(script, key, value)
|
16
16
|
mapper.run
|
17
17
|
|
18
18
|
write(output, mapper)
|
19
19
|
end
|
20
20
|
|
21
21
|
def reduce(key, values, output, reporter, script)
|
22
|
-
|
23
|
-
reducer = ReducerFactory.create(script, key.to_string, ruby_values)
|
22
|
+
reducer = ReducerFactory.create(script, key, values)
|
24
23
|
reducer.run
|
25
24
|
|
26
25
|
write(output, reducer)
|
@@ -29,8 +28,7 @@ end
|
|
29
28
|
def setup(conf, script)
|
30
29
|
setup = SetupFactory.create(script, conf)
|
31
30
|
setup.run
|
32
|
-
|
33
|
-
setup.paths.to_java
|
31
|
+
setup.paths
|
34
32
|
end
|
35
33
|
|
36
34
|
private
|
@@ -38,23 +36,7 @@ private
|
|
38
36
|
def write(output, controller)
|
39
37
|
controller.emitted.each do |e|
|
40
38
|
e.each do |k, v|
|
41
|
-
output.collect(
|
39
|
+
output.collect(k, v)
|
42
40
|
end
|
43
41
|
end
|
44
42
|
end
|
45
|
-
|
46
|
-
def to_ruby(value)
|
47
|
-
case value
|
48
|
-
when IntWritable then value.get
|
49
|
-
when Text then value.to_string
|
50
|
-
else raise "no match class: #{value.class}"
|
51
|
-
end
|
52
|
-
end
|
53
|
-
|
54
|
-
def to_hadoop(value)
|
55
|
-
case value
|
56
|
-
when Integer then IntWritable.new(value)
|
57
|
-
when String then t = Text.new; t.set(value); t
|
58
|
-
else raise "no match class: #{value.class}"
|
59
|
-
end
|
60
|
-
end
|
data/lib/hadoop-dsl.rb
ADDED
data/lib/util.rb
CHANGED
@@ -6,6 +6,17 @@ module HadoopDsl
|
|
6
6
|
end
|
7
7
|
|
8
8
|
def read_file(file_name)
|
9
|
-
|
9
|
+
# read as usual
|
10
|
+
body = File.open(file_name).read rescue nil
|
11
|
+
return body if body
|
12
|
+
|
13
|
+
# read from loadpath
|
14
|
+
$:.each do |path|
|
15
|
+
p path
|
16
|
+
body = File.open(File.join(path, file_name)).read rescue next
|
17
|
+
return body if body
|
18
|
+
end
|
19
|
+
|
20
|
+
raise "cannot find file - #{file_name}"
|
10
21
|
end
|
11
22
|
end
|
data/spec/client_spec.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'hadoop-dsl'
|
2
|
+
|
3
|
+
describe HadoopDsl::Client do
|
4
|
+
before do
|
5
|
+
@client = HadoopDsl::Client.new(["examples/wordcount.rb", "in", "out"])
|
6
|
+
end
|
7
|
+
|
8
|
+
it 'can parse args' do
|
9
|
+
@client.files.join.should match /ruby_wrapper\.rb/
|
10
|
+
@client.files.join.should match /dsl_init\.rb/
|
11
|
+
@client.files.should include 'examples/wordcount.rb'
|
12
|
+
@client.inputs.should == 'in'
|
13
|
+
@client.outputs.should == 'out'
|
14
|
+
end
|
15
|
+
|
16
|
+
it 'can add dsl file into mapred args' do
|
17
|
+
@client.mapred_args.should ==
|
18
|
+
"--script dsl_init.rb in out --dslfile wordcount.rb"
|
19
|
+
end
|
20
|
+
|
21
|
+
it 'can add dsl lib files' do
|
22
|
+
lib_path = HadoopDsl.lib_path
|
23
|
+
@client.files.should include File.join(lib_path, 'core.rb')
|
24
|
+
@client.files.should include File.join(lib_path, 'log_analysis.rb')
|
25
|
+
end
|
26
|
+
end
|
data/spec/core_spec.rb
CHANGED
@@ -1,9 +1,4 @@
|
|
1
|
-
require '
|
2
|
-
require 'init'
|
3
|
-
|
4
|
-
import 'org.apache.hadoop.io.IntWritable'
|
5
|
-
import 'org.apache.hadoop.io.Text'
|
6
|
-
import 'org.apache.hadoop.mapred.JobConf'
|
1
|
+
require 'dsl_init'
|
7
2
|
|
8
3
|
describe 'mapreduce init' do
|
9
4
|
|
@@ -24,22 +19,20 @@ end
|
|
24
19
|
end
|
25
20
|
|
26
21
|
before do
|
27
|
-
@one =
|
22
|
+
@one = 1
|
28
23
|
@output = mock('output')
|
29
24
|
end
|
30
25
|
|
31
26
|
it 'can map sucessfully' do
|
32
|
-
key
|
33
|
-
|
34
|
-
value.set('it should be fine')
|
27
|
+
key = 'key'
|
28
|
+
value = 'it should be fine'
|
35
29
|
@output.should_receive(:collect).once #.with(@text, @one)
|
36
30
|
|
37
31
|
map(key, value, @output, nil, @script)
|
38
32
|
end
|
39
33
|
|
40
34
|
it 'can reduce sucessfully' do
|
41
|
-
key
|
42
|
-
key.set("t1\tkey")
|
35
|
+
key = "t1\tkey"
|
43
36
|
values = [@one, @one, @one]
|
44
37
|
@output.should_receive(:collect).once #.with(@text, @one)
|
45
38
|
|
@@ -47,7 +40,7 @@ end
|
|
47
40
|
end
|
48
41
|
|
49
42
|
it 'can set job conf' do
|
50
|
-
conf =
|
43
|
+
conf = mock('jobconf')
|
51
44
|
paths = setup(conf, @script)
|
52
45
|
|
53
46
|
paths[0].should == 'test/inputs'
|
data/spec/hive_like_spec.rb
CHANGED
data/spec/log_analysis_spec.rb
CHANGED
data/spec/util_spec.rb
CHANGED
data/spec/word_count_spec.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hadoop-rubydsl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Koichi Fujikawa
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date:
|
12
|
+
date: 2010-01-04 00:00:00 +09:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -25,21 +25,23 @@ dependencies:
|
|
25
25
|
description: Hadoop Ruby DSL
|
26
26
|
email: fujibee@gmail.com
|
27
27
|
executables:
|
28
|
+
- hrd
|
28
29
|
- hadoop-hudson.sh
|
29
30
|
- hadoop-ruby.sh
|
30
31
|
extensions: []
|
31
32
|
|
32
33
|
extra_rdoc_files:
|
33
|
-
- README
|
34
|
+
- README.rdoc
|
34
35
|
- TODO
|
35
36
|
files:
|
36
37
|
- .gitignore
|
37
|
-
- README
|
38
|
+
- README.rdoc
|
38
39
|
- Rakefile
|
39
40
|
- TODO
|
40
41
|
- VERSION
|
41
42
|
- bin/hadoop-hudson.sh
|
42
43
|
- bin/hadoop-ruby.sh
|
44
|
+
- bin/hrd
|
43
45
|
- conf/hadoop-site.xml
|
44
46
|
- examples/apachelog-v2-2.rb
|
45
47
|
- examples/apachelog-v2.rb
|
@@ -47,11 +49,11 @@ files:
|
|
47
49
|
- examples/hive_like_test.rb
|
48
50
|
- examples/word_count_test.rb
|
49
51
|
- hadoop-rubydsl.gemspec
|
52
|
+
- lib/client.rb
|
50
53
|
- lib/core.rb
|
54
|
+
- lib/dsl_init.rb
|
55
|
+
- lib/hadoop-dsl.rb
|
51
56
|
- lib/hive_like.rb
|
52
|
-
- lib/init.rb
|
53
|
-
- lib/java/.gitignore
|
54
|
-
- lib/java/hadoop-ruby.jar
|
55
57
|
- lib/log_analysis.rb
|
56
58
|
- lib/mapred_factory.rb
|
57
59
|
- lib/util.rb
|
@@ -86,14 +88,15 @@ specification_version: 3
|
|
86
88
|
summary: Hadoop Ruby DSL
|
87
89
|
test_files:
|
88
90
|
- spec/spec_helper.rb
|
91
|
+
- spec/dsl_init_spec.rb
|
89
92
|
- spec/core_spec.rb
|
93
|
+
- spec/client_spec.rb
|
90
94
|
- spec/util_spec.rb
|
91
95
|
- spec/mapred_factory_spec.rb
|
92
96
|
- spec/word_count_spec.rb
|
93
97
|
- spec/hive_like_spec.rb
|
94
98
|
- spec/log_analysis_spec.rb
|
95
99
|
- spec/example_spec.rb
|
96
|
-
- spec/init_spec.rb
|
97
100
|
- examples/apachelog-v2.rb
|
98
101
|
- examples/hive_like_test.rb
|
99
102
|
- examples/word_count_test.rb
|
data/README
DELETED
@@ -1,53 +0,0 @@
|
|
1
|
-
= hadoop-rubydsl
|
2
|
-
|
3
|
-
== Description
|
4
|
-
HadoopのMapper/ReducerをRubyによるDSLで記述することができます。
|
5
|
-
hadoop-ruby.jarを利用します。
|
6
|
-
|
7
|
-
例)
|
8
|
-
apachelog.rb
|
9
|
-
|
10
|
-
# log:
|
11
|
-
# 127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326
|
12
|
-
# 127.0.0.1 - frank2 [10/Oct/2000:13:55:36 -0700] "GET /apache_pb2.gif HTTP/1.0" 200 2326
|
13
|
-
# 127.0.0.1 - frank2 [10/Oct/2000:13:55:36 -0700] "GET /apache_pb3.gif HTTP/1.0" 404 2326
|
14
|
-
|
15
|
-
use 'LogAnalysis'
|
16
|
-
data.pattern /(.*) (.*) (.*) (\[.*\]) (".*") (\d*) (\d*)/
|
17
|
-
column[2].count_uniq
|
18
|
-
column[3].count_uniq
|
19
|
-
column[4].count_uniq
|
20
|
-
column[5].count_uniq
|
21
|
-
column[6].sum
|
22
|
-
|
23
|
-
=>
|
24
|
-
col2 frank 1
|
25
|
-
col2 frank2 2
|
26
|
-
col3 [10/Oct/2000:13:55:36 -0700] 3
|
27
|
-
col4 "GET /apache_pb.gif HTTP/1.0" 1
|
28
|
-
col4 "GET /apache_pb2.gif HTTP/1.0" 1
|
29
|
-
col4 "GET /apache_pb3.gif HTTP/1.0" 1
|
30
|
-
col5 200 2
|
31
|
-
col5 404 1
|
32
|
-
col6 6978
|
33
|
-
|
34
|
-
== Usage
|
35
|
-
0. HADOOP_HOMEを正しく設定し、Hadoopを一式立ち上げておく。
|
36
|
-
|
37
|
-
1. jruby-complete-*.jar を lib/java 以下にコピー
|
38
|
-
ex)
|
39
|
-
$ wget http://jruby.kenai.com/downloads/1.4.0RC2/jruby-complete-1.4.0RC2.jar
|
40
|
-
$ cp jruby-complete-*.jar lib/java/
|
41
|
-
|
42
|
-
2. データを HDFS にアップロード
|
43
|
-
ex)
|
44
|
-
$ hadoop dfs -copyFromLocal apachelog inputs/
|
45
|
-
|
46
|
-
3. MapReduce実行
|
47
|
-
$ bin/hadoop-ruby.sh examples/apachelog.rb inputs outputs
|
48
|
-
|
49
|
-
== Author
|
50
|
-
Koichi Fujikawa <fujibee@gmail.com>
|
51
|
-
|
52
|
-
== Copyright
|
53
|
-
License: Apache License
|
data/lib/java/.gitignore
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
jruby-complete-*.jar
|
data/lib/java/hadoop-ruby.jar
DELETED
Binary file
|