sandofsky-ralph 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.autotest +23 -0
- data/History.txt +6 -0
- data/Manifest.txt +9 -0
- data/README.txt +76 -0
- data/Rakefile +13 -0
- data/bin/ralph +60 -0
- data/lib/iterator.rb +67 -0
- data/lib/ralph.rb +59 -0
- data/test/test_ralph.rb +8 -0
- metadata +105 -0
data/.autotest
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# -*- ruby -*-
|
2
|
+
|
3
|
+
require 'autotest/restart'
|
4
|
+
|
5
|
+
# Autotest.add_hook :initialize do |at|
|
6
|
+
# at.extra_files << "../some/external/dependency.rb"
|
7
|
+
#
|
8
|
+
# at.libs << ":../some/external"
|
9
|
+
#
|
10
|
+
# at.add_exception 'vendor'
|
11
|
+
#
|
12
|
+
# at.add_mapping(/dependency.rb/) do |f, _|
|
13
|
+
# at.files_matching(/test_.*rb$/)
|
14
|
+
# end
|
15
|
+
#
|
16
|
+
# %w(TestA TestB).each do |klass|
|
17
|
+
# at.extra_class_map[klass] = "test/test_misc.rb"
|
18
|
+
# end
|
19
|
+
# end
|
20
|
+
|
21
|
+
# Autotest.add_hook :run_command do |at|
|
22
|
+
# system "rake build"
|
23
|
+
# end
|
data/History.txt
ADDED
data/Manifest.txt
ADDED
data/README.txt
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
= ralph
|
2
|
+
|
3
|
+
* http://github.com/sandofsky/ralph
|
4
|
+
|
5
|
+
== DESCRIPTION:
|
6
|
+
|
7
|
+
A pretty, simple wrapper for writing Hadoop Streaming jobs in Ruby.
|
8
|
+
|
9
|
+
== FEATURES/PROBLEMS:
|
10
|
+
|
11
|
+
* First version.
|
12
|
+
|
13
|
+
== SYNOPSIS:
|
14
|
+
|
15
|
+
Define your mappers and reducers in Ruby with the help of a domain specific language.
|
16
|
+
|
17
|
+
Ralph.job do
|
18
|
+
|
19
|
+
mapper do |row|
|
20
|
+
row.split(/\s/).each do |word|
|
21
|
+
emit word, 1
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
reducer do |key, values|
|
26
|
+
total = values.inject(0) {|t, i| t + i.to_i }
|
27
|
+
emit key, total
|
28
|
+
end
|
29
|
+
|
30
|
+
end
|
31
|
+
|
32
|
+
Then run your task with:
|
33
|
+
|
34
|
+
ralph fire word_count.rb input.csv output
|
35
|
+
|
36
|
+
Which would translate into:
|
37
|
+
|
38
|
+
$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/contrib/streaming/hadoop-*-streaming.jar \
|
39
|
+
-input input.csv \
|
40
|
+
-output output \
|
41
|
+
-mapper "ralph map word_count.rb" \
|
42
|
+
-reducer "ralph reduce word_count.rb" \
|
43
|
+
-file word_count.rb
|
44
|
+
|
45
|
+
== REQUIREMENTS:
|
46
|
+
|
47
|
+
* Make sure to export your $HADOOP_HOME as the path to hadoop.
|
48
|
+
|
49
|
+
== INSTALL:
|
50
|
+
|
51
|
+
* sudo gem install sandofsky-ralph --source http://gems.github.com
|
52
|
+
|
53
|
+
== LICENSE:
|
54
|
+
|
55
|
+
(The MIT License)
|
56
|
+
|
57
|
+
Copyright (c) 2009 FIX
|
58
|
+
|
59
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
60
|
+
a copy of this software and associated documentation files (the
|
61
|
+
'Software'), to deal in the Software without restriction, including
|
62
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
63
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
64
|
+
permit persons to whom the Software is furnished to do so, subject to
|
65
|
+
the following conditions:
|
66
|
+
|
67
|
+
The above copyright notice and this permission notice shall be
|
68
|
+
included in all copies or substantial portions of the Software.
|
69
|
+
|
70
|
+
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
71
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
72
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
73
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
74
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
75
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
76
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/Rakefile
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# -*- ruby -*-
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'hoe'
|
5
|
+
|
6
|
+
Hoe.spec 'ralph' do
|
7
|
+
developer('Ben Sandofsky', 'sandofsky@gmail.com')
|
8
|
+
extra_deps << ['fastercsv', '~> 1.4.0']
|
9
|
+
extra_deps << ['sandofsky-csvscan', '>= 0.1.0']
|
10
|
+
extra_deps << ['wycats-thor', '>= 0.11.5']
|
11
|
+
end
|
12
|
+
|
13
|
+
# vim: syntax=ruby
|
data/bin/ralph
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
3
|
+
require 'thor'
|
4
|
+
require File.expand_path(File.dirname(__FILE__) + "/../lib/ralph")
|
5
|
+
# Command-line front end for Ralph, implemented as a Thor task class.
#
# Tasks:
#   fire     - submit a job script to Hadoop Streaming
#   simulate - run mapper | sort | reducer locally through pipes
#   map      - run only the mapper of a job script (STDIN -> STDOUT)
#   reduce   - run only the reducer of a job script (STDIN -> STDOUT)
class RalphCLI < Thor
  # Stdlib helper used to quote user-supplied values before they reach the shell.
  require 'shellwords'

  desc "fire <script.rb> <input> [output]", "run via hadoop streaming"
  # Builds the Hadoop Streaming command line and echoes the command's output.
  #
  # script - path of the Ralph job script; shipped to the cluster with -file.
  # input  - value passed to -input.
  # output - value passed to -output; defaults to "<input>_output".
  #
  # Aborts with a message when HADOOP_HOME is unset, because every path in the
  # command is derived from it.
  def fire(script, input, output = nil)
    hadoop_home = ENV["HADOOP_HOME"].to_s
    abort "HADOOP_HOME is not set; export it as the path to hadoop." if hadoop_home.empty?

    output ||= "#{input}_output"
    home = Shellwords.escape(hadoop_home)

    # Every interpolated value is shell-escaped. The jar glob is deliberately
    # left unescaped so the shell can expand it to the installed streaming jar.
    hadoop_command = [
      "#{home}/bin/hadoop jar #{home}/contrib/streaming/hadoop-*-streaming.jar",
      "-input #{Shellwords.escape(input)}",
      "-output #{Shellwords.escape(output)}",
      "-mapper #{Shellwords.escape("ralph map #{script}")}",
      "-reducer #{Shellwords.escape("ralph reduce #{script}")}",
      "-file #{Shellwords.escape(script)}"
    ].join(" ")

    IO.popen(hadoop_command, 'r+') do |io|
      io.each_line { |line| puts line }
    end
  end

  desc "simulate <script.rb>", "simulates a mapreduce job through unix pipes and 'sort'"
  # Runs the job locally: mapper output is written to a `sort` subprocess and
  # the sorted lines are fed to the reducer, or printed as-is when the script
  # defines no reducer. Mapper input is read from STDIN.
  def simulate(script)
    require_job(script)
    sorter = IO.popen("sort", 'w+')
    threads = []

    threads << Thread.new(sorter) do |s|
      begin
        Ralph.run_mapper!(STDIN, s)
      ensure
        # Always signal end-of-input to sort, otherwise the reading thread
        # would wait forever if the mapper raised.
        s.close_write
      end
    end

    threads << if Ralph::Job.reducer?
      Thread.new(sorter) { |s| Ralph.run_reducer!(s, STDOUT) }
    else
      Thread.new(sorter) { |s| s.each_line { |line| puts line } }
    end

    threads.each { |t| t.join }
  ensure
    sorter.close if sorter && !sorter.closed?
  end

  desc "map <script.rb>", "Run mapper"
  # Loads the job script and runs its mapper from STDIN to STDOUT.
  def map(script)
    require_job(script)
    Ralph.run_mapper!
  end

  desc "reduce <script.rb>", "Run reducer"
  # Loads the job script and runs its reducer from STDIN to STDOUT.
  def reduce(script)
    require_job(script)
    Ralph.run_reducer!
  end

  private

  # Requires the job script. A path that exists relative to the current
  # directory is required by absolute path, because Ruby 1.9+ no longer has
  # "." on the load path; anything else falls back to a plain `require`.
  def require_job(script)
    local = File.expand_path(script)
    found = File.exist?(local) || File.exist?("#{local}.rb")
    require(found ? local : script)
  end

end
|
60
|
+
RalphCLI.start
|
data/lib/iterator.rb
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
class Ralph
  # Module-level cursor over tab-separated key/value lines read from `input`.
  #
  # One record (key, value) is held as "current" at a time. `each` and
  # `inject` walk the values of the current key's group, advancing the cursor
  # with `grab!` as they go. All state lives in class variables, so there is
  # a single shared cursor per process.
  module Iterator

    @@input = STDIN
    @@last_key = nil
    @@current_key = nil
    @@current_value = nil
    @@has_record = false   # true while a record read by grab! is still current
    @@has_iterated = nil

    # Sets the IO-like object records are read from (needs #eof? and
    # #readline) and clears any record state left over from a previous input.
    def self.input=(i)
      @@input = i
      @@last_key = @@current_key = @@current_value = nil
      @@has_record = false
      @@has_iterated = nil
    end

    # Returns the object records are read from.
    def self.input
      @@input
    end

    # Re-arms the one-pass check so the next group's values can be iterated.
    def self.ready!
      @@has_iterated = false
    end

    # Key of the current record, or nil when no record is loaded.
    def self.current_key
      @@current_key
    end

    # True when the current record's key differs from the previous record's.
    def self.start_of_group?
      @@last_key != @@current_key
    end

    # Advances until the key changes or the records run out.
    def self.next_group!
      grab! until start_of_group? || !@@has_record
    end

    # Yields each value of the current key's group, including the last record
    # of the input. Raises RuntimeError when the group was already iterated
    # since the last `ready!`.
    def self.each
      consume_group { |value| yield value }
    end

    # Folds the values of the current group: the block receives
    # (accumulator, value) and its result becomes the next accumulator.
    # `s` is the initial accumulator (nil by default). Returns the final
    # accumulator. Raises like `each` on a second pass.
    def self.inject(s = nil)
      consume_group { |value| s = yield(s, value) }
      s
    end

    # Guards against iterating the same group's values twice.
    def self.repeat_iteration_check
      raise "You cannot call the value iterator more than once in a reduce" if @@has_iterated
    end

    # Reads the next line into the current record, remembering the previous
    # key. At end of input the current record is cleared instead of letting
    # `readline` raise EOFError.
    def self.grab!
      @@last_key = @@current_key.dup unless @@current_key.nil?
      if input.eof?
        @@current_key = @@current_value = nil
        @@has_record = false
      else
        @@current_key, @@current_value = FasterCSV.parse_line(input.readline, :col_sep => "\t")
        @@has_record = true
      end
    end

    # Shared loop behind `each` and `inject`: yields values while the current
    # record belongs to the group that was current on entry. Termination is
    # based on whether a record is loaded rather than on `input.eof?`, because
    # eof? already reports true while the final record is still unprocessed.
    def self.consume_group
      repeat_iteration_check
      group_key = @@current_key.nil? ? nil : @@current_key.dup
      while @@has_record && group_key == @@current_key
        yield @@current_value
        grab!
      end
      @@has_iterated = true
    end

  end
end
|
data/lib/ralph.rb
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
require 'csvscan'
|
2
|
+
require 'fastercsv'
|
3
|
+
require File.dirname(__FILE__) + '/iterator'
|
4
|
+
# DSL entry point for defining and running a map/reduce job.
#
#   Ralph.job do
#     mapper  { |row| ... emit key, value ... }
#     reducer { |key, values| ... emit key, total ... }
#   end
#
# `mapper` and `reducer` install the given blocks as Job#map and Job#reduce;
# `run_mapper!` and `run_reducer!` drive them over an input stream.
class Ralph
  VERSION = '0.1.0'

  # Input format passed to `mapper` (:csv selects CSVScan), nil for raw lines.
  @@map_input = nil

  # Holds the user-defined #map / #reduce methods and the output sink.
  class Job
    attr_accessor :output

    # Writes one tab-separated line to `output`: "key<TAB>value", or just the
    # key when value is nil/false (value may be omitted).
    def emit(key, value = nil)
      row = value ? [key, value] : [key]
      output << FasterCSV.generate_line(row, :col_sep => "\t")
    end

    # True once a reducer has been defined. Uses method_defined? because
    # instance_methods returns Symbols on Ruby 1.9+, so a String comparison
    # there is always false.
    def self.reducer?
      method_defined?(:reduce)
    end

  end

  # One-pass cursor over sorted "key<TAB>value" lines, handed to reducers as
  # their `values` argument (responds to #each and #inject). It holds the next
  # unprocessed record, so the final record of the input is still delivered
  # after the underlying IO already reports eof?.
  class ReduceStream
    # Key of the next unprocessed record, or nil when the input is drained.
    attr_reader :current_key

    # input - IO-like object responding to #eof? and #readline.
    def initialize(input)
      @input = input
      @group_key = nil
      @has_iterated = false
      advance
    end

    # True when every record has been consumed.
    def exhausted?
      !@has_record
    end

    # Marks the current record's key as the active group and re-arms the
    # one-pass check.
    def ready!
      @group_key = @current_key.nil? ? nil : @current_key.dup
      @has_iterated = false
    end

    # Yields each value of the active group. Raises RuntimeError when called
    # a second time for the same group.
    def each
      raise "You cannot call the value iterator more than once in a reduce" if @has_iterated
      @has_iterated = true
      while in_group?
        value = @current_value
        advance            # move on first, so a `break` never re-yields a value
        yield value
      end
      true
    end

    # Folds the active group's values; the block receives (accumulator, value).
    # `s` is the initial accumulator (nil by default). Returns the final one.
    def inject(s = nil)
      each { |value| s = yield(s, value) }
      s
    end

    # Discards whatever is left of the active group, so the driver always
    # makes progress even when a reducer ignores or abandons its values.
    def finish_group!
      advance while in_group?
    end

    private

    # True while the pending record belongs to the active group.
    def in_group?
      @has_record && @current_key == @group_key
    end

    # Loads the next line into the pending record, or clears it at end of input.
    def advance
      if @input.eof?
        @current_key = @current_value = nil
        @has_record = false
      else
        @current_key, @current_value = FasterCSV.parse_line(@input.readline, :col_sep => "\t")
        @has_record = true
      end
    end
  end

  # Installs `block` as Job#map. `format` selects how run_mapper! reads its
  # input: :csv goes through CSVScan, anything else is line by line.
  def self.mapper(format = nil, &block)
    Job.send(:define_method, :map, &block)
    @@map_input = format
  end

  # Installs `block` as Job#reduce; it is called with (key, values).
  def self.reducer(&block)
    Job.send(:define_method, :reduce, &block)
  end

  # Feeds every input record to Job#map, with emit writing to `output`.
  def self.run_mapper!(input = STDIN, output = STDOUT)
    j = Job.new
    j.output = output
    if @@map_input == :csv
      # NOTE(review): presumably yields one parsed row per CSV record - confirm against csvscan.
      CSVScan.scan(input) { |row| j.map(row) }
    else
      input.each_line { |line| j.map(line) }
    end
  end

  # Calls Job#reduce once per run of consecutive equal keys in `input`
  # (tab-separated "key<TAB>value" lines, expected to arrive sorted by key),
  # with emit writing to `output`. Empty input calls the reducer zero times.
  def self.run_reducer!(input = STDIN, output = STDOUT)
    j = Job.new
    j.output = output
    stream = ReduceStream.new(input)
    until stream.exhausted?
      stream.ready!
      j.reduce(stream.current_key, stream)
      stream.finish_group!
    end
  end

  # Evaluates the job definition block with Ralph as self, so `mapper` and
  # `reducer` can be called bare inside it.
  def self.job(&block)
    instance_eval(&block) if block_given?
  end

end
|
data/test/test_ralph.rb
ADDED
metadata
ADDED
@@ -0,0 +1,105 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: sandofsky-ralph
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Ben Sandofsky
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-08-14 00:00:00 -07:00
|
13
|
+
default_executable: ralph
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: fastercsv
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ~>
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 1.4.0
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: sandofsky-csvscan
|
27
|
+
type: :runtime
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.1.0
|
34
|
+
version:
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: wycats-thor
|
37
|
+
type: :runtime
|
38
|
+
version_requirement:
|
39
|
+
version_requirements: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 0.11.5
|
44
|
+
version:
|
45
|
+
- !ruby/object:Gem::Dependency
|
46
|
+
name: hoe
|
47
|
+
type: :development
|
48
|
+
version_requirement:
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - ">="
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: 2.3.3
|
54
|
+
version:
|
55
|
+
description: A pretty, simple wrapper for writing Hadoop Streaming jobs in Ruby.
|
56
|
+
email:
|
57
|
+
- sandofsky@gmail.com
|
58
|
+
executables:
|
59
|
+
- ralph
|
60
|
+
extensions: []
|
61
|
+
|
62
|
+
extra_rdoc_files:
|
63
|
+
- History.txt
|
64
|
+
- Manifest.txt
|
65
|
+
- README.txt
|
66
|
+
files:
|
67
|
+
- .autotest
|
68
|
+
- History.txt
|
69
|
+
- Manifest.txt
|
70
|
+
- README.txt
|
71
|
+
- Rakefile
|
72
|
+
- bin/ralph
|
73
|
+
- lib/iterator.rb
|
74
|
+
- lib/ralph.rb
|
75
|
+
- test/test_ralph.rb
|
76
|
+
has_rdoc: false
|
77
|
+
homepage: http://github.com/sandofsky/ralph
|
78
|
+
licenses:
|
79
|
+
post_install_message:
|
80
|
+
rdoc_options:
|
81
|
+
- --main
|
82
|
+
- README.txt
|
83
|
+
require_paths:
|
84
|
+
- lib
|
85
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: "0"
|
90
|
+
version:
|
91
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
92
|
+
requirements:
|
93
|
+
- - ">="
|
94
|
+
- !ruby/object:Gem::Version
|
95
|
+
version: "0"
|
96
|
+
version:
|
97
|
+
requirements: []
|
98
|
+
|
99
|
+
rubyforge_project: ralph
|
100
|
+
rubygems_version: 1.3.5
|
101
|
+
signing_key:
|
102
|
+
specification_version: 3
|
103
|
+
summary: A pretty, simple wrapper for writing Hadoop Streaming jobs in Ruby.
|
104
|
+
test_files:
|
105
|
+
- test/test_ralph.rb
|