hadoop-rubydsl 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +58 -0
- data/VERSION +1 -1
- data/bin/hrd +5 -0
- data/hadoop-rubydsl.gemspec +11 -9
- data/lib/client.rb +27 -0
- data/lib/{init.rb → dsl_init.rb} +4 -22
- data/lib/hadoop-dsl.rb +12 -0
- data/lib/util.rb +12 -1
- data/spec/client_spec.rb +26 -0
- data/spec/core_spec.rb +1 -1
- data/spec/{init_spec.rb → dsl_init_spec.rb} +6 -13
- data/spec/hive_like_spec.rb +0 -1
- data/spec/log_analysis_spec.rb +0 -1
- data/spec/util_spec.rb +4 -0
- data/spec/word_count_spec.rb +0 -1
- metadata +11 -8
- data/README +0 -53
- data/lib/java/.gitignore +0 -1
- data/lib/java/hadoop-ruby.jar +0 -0
data/README.rdoc
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
= hadoop-rubydsl
|
2
|
+
|
3
|
+
Enables you to run Ruby DSL scripts on your Hadoop cluster.
|
4
|
+
|
5
|
+
== Description
|
6
|
+
|
7
|
+
You can write a DSL script in Ruby that runs on Hadoop as a Mapper / Reducer.
|
8
|
+
This gem depends on the 'jruby-on-hadoop' project.
|
9
|
+
|
10
|
+
== Install
|
11
|
+
|
12
|
+
All required gems are available on GemCutter.
|
13
|
+
|
14
|
+
1. Upgrade RubyGems to 1.3.5
|
15
|
+
2. Install gems
|
16
|
+
$ gem install hadoop-rubydsl
|
17
|
+
|
18
|
+
== Usage
|
19
|
+
|
20
|
+
1. Run a Hadoop cluster on your machines and set the HADOOP_HOME environment variable.
|
21
|
+
2. Put files into your HDFS, e.g. wc/inputs/file1
|
22
|
+
3. Now you can run 'hrd' like below:
|
23
|
+
$ hrd examples/word_count_test.rb
|
24
|
+
You can find the Hadoop job results in your HDFS under wc/outputs/part-*
|
25
|
+
|
26
|
+
== Examples
|
27
|
+
|
28
|
+
Word Count DSL script
|
29
|
+
use 'WordCount'
|
30
|
+
|
31
|
+
from 'wc/inputs'
|
32
|
+
to 'wc/outputs'
|
33
|
+
|
34
|
+
count_uniq
|
35
|
+
total :bytes, :words, :lines
|
36
|
+
|
37
|
+
Log Analysis DSL script
|
38
|
+
use 'LogAnalysis'
|
39
|
+
|
40
|
+
data 'apache log on test2' do
|
41
|
+
from 'apachelog/inputs'
|
42
|
+
to 'apachelog/outputs'
|
43
|
+
|
44
|
+
each_line do
|
45
|
+
pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*) (.*) "(.*)"/
|
46
|
+
column_name 'remote_host', 'pass', 'user', 'access_date', 'request', 'status', 'bytes', 'pass', 'ua'
|
47
|
+
|
48
|
+
topic 'ua counts', :label => 'ua' do
|
49
|
+
count_uniq column[:ua]
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
== Author
|
55
|
+
Koichi Fujikawa <fujibee@gmail.com>
|
56
|
+
|
57
|
+
== Copyright
|
58
|
+
License: Apache License
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.2
|
1
|
+
0.0.3
|
data/bin/hrd
ADDED
data/hadoop-rubydsl.gemspec
CHANGED
@@ -5,26 +5,27 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{hadoop-rubydsl}
|
8
|
-
s.version = "0.0.
|
8
|
+
s.version = "0.0.3"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Koichi Fujikawa"]
|
12
|
-
s.date = %q{
|
12
|
+
s.date = %q{2010-01-04}
|
13
13
|
s.description = %q{Hadoop Ruby DSL}
|
14
14
|
s.email = %q{fujibee@gmail.com}
|
15
|
-
s.executables = ["hadoop-hudson.sh", "hadoop-ruby.sh"]
|
15
|
+
s.executables = ["hrd", "hadoop-hudson.sh", "hadoop-ruby.sh"]
|
16
16
|
s.extra_rdoc_files = [
|
17
|
-
"README",
|
17
|
+
"README.rdoc",
|
18
18
|
"TODO"
|
19
19
|
]
|
20
20
|
s.files = [
|
21
21
|
".gitignore",
|
22
|
-
"README",
|
22
|
+
"README.rdoc",
|
23
23
|
"Rakefile",
|
24
24
|
"TODO",
|
25
25
|
"VERSION",
|
26
26
|
"bin/hadoop-hudson.sh",
|
27
27
|
"bin/hadoop-ruby.sh",
|
28
|
+
"bin/hrd",
|
28
29
|
"conf/hadoop-site.xml",
|
29
30
|
"examples/apachelog-v2-2.rb",
|
30
31
|
"examples/apachelog-v2.rb",
|
@@ -32,11 +33,11 @@ Gem::Specification.new do |s|
|
|
32
33
|
"examples/hive_like_test.rb",
|
33
34
|
"examples/word_count_test.rb",
|
34
35
|
"hadoop-rubydsl.gemspec",
|
36
|
+
"lib/client.rb",
|
35
37
|
"lib/core.rb",
|
38
|
+
"lib/dsl_init.rb",
|
39
|
+
"lib/hadoop-dsl.rb",
|
36
40
|
"lib/hive_like.rb",
|
37
|
-
"lib/init.rb",
|
38
|
-
"lib/java/.gitignore",
|
39
|
-
"lib/java/hadoop-ruby.jar",
|
40
41
|
"lib/log_analysis.rb",
|
41
42
|
"lib/mapred_factory.rb",
|
42
43
|
"lib/util.rb",
|
@@ -49,14 +50,15 @@ Gem::Specification.new do |s|
|
|
49
50
|
s.summary = %q{Hadoop Ruby DSL}
|
50
51
|
s.test_files = [
|
51
52
|
"spec/spec_helper.rb",
|
53
|
+
"spec/dsl_init_spec.rb",
|
52
54
|
"spec/core_spec.rb",
|
55
|
+
"spec/client_spec.rb",
|
53
56
|
"spec/util_spec.rb",
|
54
57
|
"spec/mapred_factory_spec.rb",
|
55
58
|
"spec/word_count_spec.rb",
|
56
59
|
"spec/hive_like_spec.rb",
|
57
60
|
"spec/log_analysis_spec.rb",
|
58
61
|
"spec/example_spec.rb",
|
59
|
-
"spec/init_spec.rb",
|
60
62
|
"examples/apachelog-v2.rb",
|
61
63
|
"examples/hive_like_test.rb",
|
62
64
|
"examples/word_count_test.rb",
|
data/lib/client.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
module HadoopDsl
|
2
|
+
class Client < JRubyOnHadoop::Client
|
3
|
+
def parse_args
|
4
|
+
super
|
5
|
+
@script_path = HadoopDsl.dsl_init_script
|
6
|
+
@script = File.basename(@script_path)
|
7
|
+
@dsl_file_path = @args[0]
|
8
|
+
@dsl_file = File.basename(@dsl_file_path)
|
9
|
+
@files << @script_path << @dsl_file_path
|
10
|
+
|
11
|
+
# TODO move properly, with jruby-on-hadoop
|
12
|
+
add_dsl_lib_files
|
13
|
+
ENV['RUBYLIB'] = File.dirname(@dsl_file_path)
|
14
|
+
end
|
15
|
+
|
16
|
+
def mapred_args
|
17
|
+
args = super
|
18
|
+
args += " --dslfile #{@dsl_file}"
|
19
|
+
args
|
20
|
+
end
|
21
|
+
|
22
|
+
def add_dsl_lib_files
|
23
|
+
lib_path = HadoopDsl.lib_path
|
24
|
+
@files += Dir.glob(File.join(lib_path, "*.rb"))
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
data/lib/{init.rb → dsl_init.rb}
RENAMED
@@ -12,15 +12,14 @@ HadoopDsl::Text = Text
|
|
12
12
|
HadoopDsl::IntWritable = IntWritable
|
13
13
|
|
14
14
|
def map(key, value, output, reporter, script)
|
15
|
-
mapper = MapperFactory.create(script, key
|
15
|
+
mapper = MapperFactory.create(script, key, value)
|
16
16
|
mapper.run
|
17
17
|
|
18
18
|
write(output, mapper)
|
19
19
|
end
|
20
20
|
|
21
21
|
def reduce(key, values, output, reporter, script)
|
22
|
-
|
23
|
-
reducer = ReducerFactory.create(script, key.to_string, ruby_values)
|
22
|
+
reducer = ReducerFactory.create(script, key, values)
|
24
23
|
reducer.run
|
25
24
|
|
26
25
|
write(output, reducer)
|
@@ -29,8 +28,7 @@ end
|
|
29
28
|
def setup(conf, script)
|
30
29
|
setup = SetupFactory.create(script, conf)
|
31
30
|
setup.run
|
32
|
-
|
33
|
-
setup.paths.to_java
|
31
|
+
setup.paths
|
34
32
|
end
|
35
33
|
|
36
34
|
private
|
@@ -38,23 +36,7 @@ private
|
|
38
36
|
def write(output, controller)
|
39
37
|
controller.emitted.each do |e|
|
40
38
|
e.each do |k, v|
|
41
|
-
output.collect(
|
39
|
+
output.collect(k, v)
|
42
40
|
end
|
43
41
|
end
|
44
42
|
end
|
45
|
-
|
46
|
-
def to_ruby(value)
|
47
|
-
case value
|
48
|
-
when IntWritable then value.get
|
49
|
-
when Text then value.to_string
|
50
|
-
else raise "no match class: #{value.class}"
|
51
|
-
end
|
52
|
-
end
|
53
|
-
|
54
|
-
def to_hadoop(value)
|
55
|
-
case value
|
56
|
-
when Integer then IntWritable.new(value)
|
57
|
-
when String then t = Text.new; t.set(value); t
|
58
|
-
else raise "no match class: #{value.class}"
|
59
|
-
end
|
60
|
-
end
|
data/lib/hadoop-dsl.rb
ADDED
data/lib/util.rb
CHANGED
@@ -6,6 +6,17 @@ module HadoopDsl
|
|
6
6
|
end
|
7
7
|
|
8
8
|
def read_file(file_name)
|
9
|
-
|
9
|
+
# read as usual
|
10
|
+
body = File.open(file_name).read rescue nil
|
11
|
+
return body if body
|
12
|
+
|
13
|
+
# read from loadpath
|
14
|
+
$:.each do |path|
|
15
|
+
p path
|
16
|
+
body = File.open(File.join(path, file_name)).read rescue next
|
17
|
+
return body if body
|
18
|
+
end
|
19
|
+
|
20
|
+
raise "cannot find file - #{file_name}"
|
10
21
|
end
|
11
22
|
end
|
data/spec/client_spec.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'hadoop-dsl'
|
2
|
+
|
3
|
+
describe HadoopDsl::Client do
|
4
|
+
before do
|
5
|
+
@client = HadoopDsl::Client.new(["examples/wordcount.rb", "in", "out"])
|
6
|
+
end
|
7
|
+
|
8
|
+
it 'can parse args' do
|
9
|
+
@client.files.join.should match /ruby_wrapper\.rb/
|
10
|
+
@client.files.join.should match /dsl_init\.rb/
|
11
|
+
@client.files.should include 'examples/wordcount.rb'
|
12
|
+
@client.inputs.should == 'in'
|
13
|
+
@client.outputs.should == 'out'
|
14
|
+
end
|
15
|
+
|
16
|
+
it 'can add dsl file into mapred args' do
|
17
|
+
@client.mapred_args.should ==
|
18
|
+
"--script dsl_init.rb in out --dslfile wordcount.rb"
|
19
|
+
end
|
20
|
+
|
21
|
+
it 'can add dsl lib files' do
|
22
|
+
lib_path = HadoopDsl.lib_path
|
23
|
+
@client.files.should include File.join(lib_path, 'core.rb')
|
24
|
+
@client.files.should include File.join(lib_path, 'log_analysis.rb')
|
25
|
+
end
|
26
|
+
end
|
data/spec/core_spec.rb
CHANGED
@@ -1,9 +1,4 @@
|
|
1
|
-
require '
|
2
|
-
require 'init'
|
3
|
-
|
4
|
-
import 'org.apache.hadoop.io.IntWritable'
|
5
|
-
import 'org.apache.hadoop.io.Text'
|
6
|
-
import 'org.apache.hadoop.mapred.JobConf'
|
1
|
+
require 'dsl_init'
|
7
2
|
|
8
3
|
describe 'mapreduce init' do
|
9
4
|
|
@@ -24,22 +19,20 @@ end
|
|
24
19
|
end
|
25
20
|
|
26
21
|
before do
|
27
|
-
@one =
|
22
|
+
@one = 1
|
28
23
|
@output = mock('output')
|
29
24
|
end
|
30
25
|
|
31
26
|
it 'can map sucessfully' do
|
32
|
-
key
|
33
|
-
|
34
|
-
value.set('it should be fine')
|
27
|
+
key = 'key'
|
28
|
+
value = 'it should be fine'
|
35
29
|
@output.should_receive(:collect).once #.with(@text, @one)
|
36
30
|
|
37
31
|
map(key, value, @output, nil, @script)
|
38
32
|
end
|
39
33
|
|
40
34
|
it 'can reduce sucessfully' do
|
41
|
-
key
|
42
|
-
key.set("t1\tkey")
|
35
|
+
key = "t1\tkey"
|
43
36
|
values = [@one, @one, @one]
|
44
37
|
@output.should_receive(:collect).once #.with(@text, @one)
|
45
38
|
|
@@ -47,7 +40,7 @@ end
|
|
47
40
|
end
|
48
41
|
|
49
42
|
it 'can set job conf' do
|
50
|
-
conf =
|
43
|
+
conf = mock('jobconf')
|
51
44
|
paths = setup(conf, @script)
|
52
45
|
|
53
46
|
paths[0].should == 'test/inputs'
|
data/spec/hive_like_spec.rb
CHANGED
data/spec/log_analysis_spec.rb
CHANGED
data/spec/util_spec.rb
CHANGED
data/spec/word_count_spec.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hadoop-rubydsl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Koichi Fujikawa
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date:
|
12
|
+
date: 2010-01-04 00:00:00 +09:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -25,21 +25,23 @@ dependencies:
|
|
25
25
|
description: Hadoop Ruby DSL
|
26
26
|
email: fujibee@gmail.com
|
27
27
|
executables:
|
28
|
+
- hrd
|
28
29
|
- hadoop-hudson.sh
|
29
30
|
- hadoop-ruby.sh
|
30
31
|
extensions: []
|
31
32
|
|
32
33
|
extra_rdoc_files:
|
33
|
-
- README
|
34
|
+
- README.rdoc
|
34
35
|
- TODO
|
35
36
|
files:
|
36
37
|
- .gitignore
|
37
|
-
- README
|
38
|
+
- README.rdoc
|
38
39
|
- Rakefile
|
39
40
|
- TODO
|
40
41
|
- VERSION
|
41
42
|
- bin/hadoop-hudson.sh
|
42
43
|
- bin/hadoop-ruby.sh
|
44
|
+
- bin/hrd
|
43
45
|
- conf/hadoop-site.xml
|
44
46
|
- examples/apachelog-v2-2.rb
|
45
47
|
- examples/apachelog-v2.rb
|
@@ -47,11 +49,11 @@ files:
|
|
47
49
|
- examples/hive_like_test.rb
|
48
50
|
- examples/word_count_test.rb
|
49
51
|
- hadoop-rubydsl.gemspec
|
52
|
+
- lib/client.rb
|
50
53
|
- lib/core.rb
|
54
|
+
- lib/dsl_init.rb
|
55
|
+
- lib/hadoop-dsl.rb
|
51
56
|
- lib/hive_like.rb
|
52
|
-
- lib/init.rb
|
53
|
-
- lib/java/.gitignore
|
54
|
-
- lib/java/hadoop-ruby.jar
|
55
57
|
- lib/log_analysis.rb
|
56
58
|
- lib/mapred_factory.rb
|
57
59
|
- lib/util.rb
|
@@ -86,14 +88,15 @@ specification_version: 3
|
|
86
88
|
summary: Hadoop Ruby DSL
|
87
89
|
test_files:
|
88
90
|
- spec/spec_helper.rb
|
91
|
+
- spec/dsl_init_spec.rb
|
89
92
|
- spec/core_spec.rb
|
93
|
+
- spec/client_spec.rb
|
90
94
|
- spec/util_spec.rb
|
91
95
|
- spec/mapred_factory_spec.rb
|
92
96
|
- spec/word_count_spec.rb
|
93
97
|
- spec/hive_like_spec.rb
|
94
98
|
- spec/log_analysis_spec.rb
|
95
99
|
- spec/example_spec.rb
|
96
|
-
- spec/init_spec.rb
|
97
100
|
- examples/apachelog-v2.rb
|
98
101
|
- examples/hive_like_test.rb
|
99
102
|
- examples/word_count_test.rb
|
data/README
DELETED
@@ -1,53 +0,0 @@
|
|
1
|
-
= hadoop-rubydsl
|
2
|
-
|
3
|
-
== Description
|
4
|
-
HadoopのMapper/ReducerをRubyによるDSLで記述することができます。
|
5
|
-
hadoop-ruby.jarを利用します。
|
6
|
-
|
7
|
-
例)
|
8
|
-
apachelog.rb
|
9
|
-
|
10
|
-
# log:
|
11
|
-
# 127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326
|
12
|
-
# 127.0.0.1 - frank2 [10/Oct/2000:13:55:36 -0700] "GET /apache_pb2.gif HTTP/1.0" 200 2326
|
13
|
-
# 127.0.0.1 - frank2 [10/Oct/2000:13:55:36 -0700] "GET /apache_pb3.gif HTTP/1.0" 404 2326
|
14
|
-
|
15
|
-
use 'LogAnalysis'
|
16
|
-
data.pattern /(.*) (.*) (.*) (\[.*\]) (".*") (\d*) (\d*)/
|
17
|
-
column[2].count_uniq
|
18
|
-
column[3].count_uniq
|
19
|
-
column[4].count_uniq
|
20
|
-
column[5].count_uniq
|
21
|
-
column[6].sum
|
22
|
-
|
23
|
-
=>
|
24
|
-
col2 frank 1
|
25
|
-
col2 frank2 2
|
26
|
-
col3 [10/Oct/2000:13:55:36 -0700] 3
|
27
|
-
col4 "GET /apache_pb.gif HTTP/1.0" 1
|
28
|
-
col4 "GET /apache_pb2.gif HTTP/1.0" 1
|
29
|
-
col4 "GET /apache_pb3.gif HTTP/1.0" 1
|
30
|
-
col5 200 2
|
31
|
-
col5 404 1
|
32
|
-
col6 6978
|
33
|
-
|
34
|
-
== Usage
|
35
|
-
0. HADOOP_HOMEを正しく設定し、Hadoopを一式立ち上げておく。
|
36
|
-
|
37
|
-
1. jruby-complete-*.jar を lib/java 以下にコピー
|
38
|
-
ex)
|
39
|
-
$ wget http://jruby.kenai.com/downloads/1.4.0RC2/jruby-complete-1.4.0RC2.jar
|
40
|
-
$ cp jruby-complete-*.jar lib/java/
|
41
|
-
|
42
|
-
2. データを HDFS にアップロード
|
43
|
-
ex)
|
44
|
-
$ hadoop dfs -copyFromLocal apachelog inputs/
|
45
|
-
|
46
|
-
3. MapReduce実行
|
47
|
-
$ bin/hadoop-ruby.sh examples/apachelog.rb inputs outputs
|
48
|
-
|
49
|
-
== Author
|
50
|
-
Koichi Fujikawa <fujibee@gmail.com>
|
51
|
-
|
52
|
-
== Copyright
|
53
|
-
License: Apache License
|
data/lib/java/.gitignore
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
jruby-complete-*.jar
|
data/lib/java/hadoop-ruby.jar
DELETED
Binary file
|