trafficbroker-mandy 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +11 -0
- data/bin/mandy-hadoop +20 -0
- data/bin/mandy-local +11 -0
- data/bin/mandy-map +11 -0
- data/bin/mandy-put +11 -0
- data/bin/mandy-reduce +11 -0
- data/lib/array_serializer.rb +22 -0
- data/lib/dsl.rb +16 -0
- data/lib/job.rb +32 -0
- data/lib/mandy.rb +1 -0
- data/lib/mapper.rb +40 -0
- data/lib/reducer.rb +44 -0
- data/lib/test_runner.rb +38 -0
- data/lib/tuple.rb +24 -0
- data/readme.md +9 -0
- metadata +72 -0
data/Rakefile
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require "rake"
|
3
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'lib', 'mandy')) # load lib/mandy.rb relative to this Rakefile
|
4
|
+
require 'spec/rake/spectask' # provides Spec::Rake::SpecTask used below
|
5
|
+
|
6
|
+
task :default => :spec # a bare `rake` runs the :spec task
|
7
|
+
|
8
|
+
Spec::Rake::SpecTask.new(:spec) do |t| # defines the :spec task
|
9
|
+
t.spec_files = FileList['spec/lib/**/*_spec.rb'] # every *_spec.rb file anywhere under spec/lib
|
10
|
+
t.spec_opts = %w{-f s -c -L mtime} # options handed to the spec runner as separate words
|
11
|
+
end
|
data/bin/mandy-hadoop
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
#!/usr/bin/env ruby
#
# Submits a Mandy job file to Hadoop streaming.
#
# Usage: mandy-hadoop <job_file> <input> <output> [hadoop_config]

require 'shellwords'

# Returns +path+ unchanged when it is absolute, otherwise joins it onto the
# current working directory.
def absolute_path(path)
  path =~ %r{\A/} ? path : File.join(Dir.pwd, path)
end

# Without this check a missing argument surfaced as a TypeError from File.join.
abort 'Usage: mandy-hadoop <job_file> <input> <output> [hadoop_config]' if ARGV.size < 3

file   = absolute_path(ARGV[0])
input  = ARGV[1]
output = ARGV[2]
config = ARGV[3]

# NOTE(review): presumably loaded here so a job file that fails to load aborts
# before anything is submitted — confirm this is the intent.
require file

streaming_args = []
# Only pass the config flag when a config was supplied, instead of an empty value.
streaming_args.push('-additionalconfspec', config) if config
streaming_args.push('-input', input,
                    '-mapper', "mandy-map #{file}",
                    '-reducer', "mandy-reduce #{file}",
                    '-file', file,
                    '-output', output)

# $HADOOP_HOME and the jar glob are deliberately left for the shell to expand;
# every user-supplied value is escaped so spaces or metacharacters cannot
# alter the command.
command = '$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/contrib/streaming/hadoop-*-streaming.jar ' +
          streaming_args.map { |arg| Shellwords.escape(arg) }.join(' ')

# system (rather than backticks) lets hadoop's output through and allows a
# failed submission to be reported via the exit status.
exit(1) unless system(command)
|
data/bin/mandy-local
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
#!/usr/bin/env ruby
#
# Runs a Mandy job locally as a shell pipeline: map, sort, then reduce.
#
# Usage: mandy-local <job_file> <input_file> <output_file>

require 'shellwords'

# Returns +path+ unchanged when it is absolute, otherwise joins it onto the
# current working directory.
def absolute_path(path)
  path =~ %r{\A/} ? path : File.join(Dir.pwd, path)
end

# Without this check a missing argument surfaced as a TypeError from File.join.
abort 'Usage: mandy-local <job_file> <input_file> <output_file>' if ARGV.size < 3

file   = Shellwords.escape(absolute_path(ARGV[0]))
input  = Shellwords.escape(absolute_path(ARGV[1]))
output = Shellwords.escape(absolute_path(ARGV[2]))

# Paths are escaped above so spaces or shell metacharacters in them cannot
# break or hijack the pipeline.
command = "cat #{input} | mandy-map #{file} | sort | mandy-reduce #{file} > #{output}"

# system (rather than backticks) allows a failing pipeline to be reported
# through this script's exit status.
exit(1) unless system(command)
|
data/bin/mandy-map
ADDED
data/bin/mandy-put
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
#!/usr/bin/env ruby
#
# Copies a local file into the Hadoop file system using the given config.
#
# Usage: mandy-put <local_source> <destination> <hadoop_config>

require 'shellwords'

# Returns +path+ unchanged when it is absolute, otherwise joins it onto the
# current working directory.
def absolute_path(path)
  path =~ %r{\A/} ? path : File.join(Dir.pwd, path)
end

# Without this check a missing argument surfaced as a TypeError from File.join.
abort 'Usage: mandy-put <local_source> <destination> <hadoop_config>' if ARGV.size < 3

source = absolute_path(ARGV[0])
dest   = ARGV[1]
config = absolute_path(ARGV[2])

# $HADOOP_HOME is left for the shell to expand; every user-supplied value is
# escaped so spaces or metacharacters cannot alter the command.
command = "$HADOOP_HOME/bin/hadoop fs -conf #{Shellwords.escape(config)} " \
          "-copyFromLocal #{Shellwords.escape(source)} #{Shellwords.escape(dest)}"

# system (rather than backticks) lets hadoop's output through and allows a
# failed copy to be reported via the exit status.
exit(1) unless system(command)
|
data/bin/mandy-reduce
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
module Mandy
  # Converts a flat list of items to a single delimited string and back.
  class ArraySerializer

    # Delimiter written between items (historic spelling kept for callers).
    SEPERATOR = '|' unless defined?(SEPERATOR)

    # items - the values to serialize; nil is treated as an empty list.
    def initialize(items)
      @items = items || []
    end

    # Returns the items joined with SEPERATOR.
    def to_s
      @items.join(SEPERATOR)
    end

    # Splits +str+ on SEPERATOR and returns the pieces as an Array of Strings.
    # nil is treated as an empty string (yielding []), mirroring the way the
    # constructor accepts nil, instead of raising NoMethodError.
    def self.from_s(str)
      str.to_s.split(SEPERATOR)
    end

    # Splits +str+ like from_s and builds a Tuple from each piece via
    # Tuple.from_s.
    def self.tuples_from_s(str)
      from_s(str).map { |piece| Tuple.from_s(piece) }
    end
  end
end
|
data/lib/dsl.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
module Mandy
|
2
|
+
module DSL # mixin whose map/reduce methods forward to Mandy::Job.default
|
3
|
+
|
4
|
+
def self.included(klass) # hook invoked when the module is included; klass is not used
|
5
|
+
Mandy::Job.default = Mandy::Job.new('Untitled Job') # each include installs a new default job, replacing any previous one
|
6
|
+
end
|
7
|
+
|
8
|
+
def map(&blk) # hands the block to the default job's map
|
9
|
+
Mandy::Job.default.map(&blk)
|
10
|
+
end
|
11
|
+
|
12
|
+
def reduce(&blk) # hands the block to the default job's reduce
|
13
|
+
Mandy::Job.default.reduce(&blk)
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
data/lib/job.rb
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
module Mandy
  # A named map/reduce job holding the mapper and reducer classes compiled
  # from the blocks given to #map and #reduce.
  class Job
    class << self
      # The job used when no explicit job is supplied (assigned by Mandy::DSL).
      attr_accessor :default
    end

    # name - label for the job.
    # blk  - optional block evaluated in the context of the new job, so it can
    #        call #map and #reduce directly.
    def initialize(name, &blk)
      @name = name
      instance_eval(&blk) if blk
    end

    # Compiles +blk+ into the mapper class used by #run_map.
    def map(&blk)
      @mapper_class = Mandy::Mapper.compile(&blk)
    end

    # Compiles +blk+ into the reducer class used by #run_reduce.
    def reduce(&blk)
      @reducer_class = Mandy::Reducer.compile(&blk)
    end

    # Builds a mapper over +input+/+output+, yields it to the optional block
    # (e.g. for extra setup), then executes it. Returns the result of execute.
    #
    # A job that never declared a map step falls back to the base
    # Mandy::Mapper instead of failing with NoMethodError on nil.
    def run_map(input=STDIN, output=STDOUT, &blk)
      mapper = (@mapper_class || Mandy::Mapper).new(input, output)
      yield(mapper) if blk
      mapper.execute
    end

    # Builds a reducer over +input+/+output+, yields it to the optional block,
    # then executes it. Returns the result of execute.
    #
    # A job that never declared a reduce step falls back to the base
    # Mandy::Reducer instead of failing with NoMethodError on nil.
    def run_reduce(input=STDIN, output=STDOUT, &blk)
      reducer = (@reducer_class || Mandy::Reducer).new(input, output)
      yield(reducer) if blk
      reducer.execute
    end
  end
end
|
data/lib/mandy.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
# Require each listed component file from this file's directory, in the order given.
%w(tuple array_serializer mapper reducer dsl job test_runner).each {|file| require File.join(File.dirname(__FILE__), file) }
|
data/lib/mapper.rb
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
module Mandy
  # Reads lines from an input stream, splits each into key and value, and
  # passes them to the #mapper method, which writes results through #emit.
  class Mapper

    # Character separating key from value on each line.
    KEY_VALUE_SEPERATOR = "\t" unless defined?(KEY_VALUE_SEPERATOR)

    # input  - object responding to each_line (defaults to STDIN).
    # output - object responding to puts (defaults to STDOUT).
    def initialize(input=STDIN, output=STDOUT)
      @input, @output = input, output
    end

    # Returns a new subclass of Mandy::Mapper whose #mapper method is +blk+.
    # Without a block the subclass keeps the pass-through #mapper.
    def self.compile(&blk)
      Class.new(Mandy::Mapper) do
        define_method(:mapper, &blk) if blk
      end
    end

    # Feeds every input line to #mapper as (key, value).
    #
    # The line is split on the FIRST separator only, so a value that itself
    # contains tabs is passed through intact rather than truncated. A line
    # with no separator is treated as a value with a nil key.
    def execute
      @input.each_line do |line|
        key, value = line.chomp.split(KEY_VALUE_SEPERATOR, 2)
        # No separator on the line: what was read is the value, key is nil.
        # to_s covers a blank line, where split yields no fields at all.
        key, value = nil, key.to_s if value.nil?
        mapper(key, value)
      end
    end

    # Writes "key<TAB>value" to the output, or just the key when value is nil.
    # A nil key is written as the string 'nil'.
    def emit(key, value=nil)
      key = 'nil' if key.nil?
      @output.puts(value.nil? ? key.to_s : "#{key}#{KEY_VALUE_SEPERATOR}#{value}")
    end

    private

    # Default map step: emit the pair unchanged.
    def mapper(key,value)
      emit(key, value)
    end
  end
end
|
data/lib/reducer.rb
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
module Mandy
  # Reads key/value lines from an input stream, groups consecutive lines that
  # share a key, and passes each group to the #reducer method.
  class Reducer

    # Character separating key from value on each line.
    KEY_VALUE_SEPERATOR = "\t" unless defined?(KEY_VALUE_SEPERATOR)

    # input  - object responding to each_line (defaults to STDIN).
    # output - object responding to puts (defaults to STDOUT).
    def initialize(input=STDIN, output=STDOUT)
      @input, @output = input, output
    end

    # Returns a new subclass of Mandy::Reducer whose #reducer method is +blk+.
    # Without a block the subclass keeps the pass-through #reducer.
    def self.compile(&blk)
      Class.new(Mandy::Reducer) do
        define_method(:reducer, &blk) if blk
      end
    end

    # Calls #reducer once per run of consecutive lines sharing a key, passing
    # the key and the Array of that run's values.
    #
    # The line is split on the FIRST separator only, so values containing tabs
    # are kept whole. A line without a separator contributes an empty value,
    # and blank lines are skipped.
    def execute
      last_key, values = nil, []
      @input.each_line do |line|
        key, value = line.chomp.split(KEY_VALUE_SEPERATOR, 2)
        next if key.nil? # blank line: nothing to group

        if !last_key.nil? && key != last_key
          reducer(last_key, values)
          values = []
        end
        last_key = key
        values << value.to_s
      end
      # Flush the final group; without this the last key in the input was
      # silently dropped.
      reducer(last_key, values) unless last_key.nil?
    end

    # Writes "key<TAB>value" to the output, or just the key when value is nil.
    # A nil key is written as the string 'nil'.
    def emit(key, value=nil)
      key = 'nil' if key.nil?
      @output.puts(value.nil? ? key.to_s : "#{key}#{KEY_VALUE_SEPERATOR}#{value}")
    end

    private

    # Default reduce step: emit the key together with the collected values as
    # received (the values Array is interpolated into the output line).
    def reducer(key,value)
      emit(key, value)
    end
  end
end
|
data/lib/test_runner.rb
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
# StringIO is used for the in-memory streams below and is not loaded by default.
require 'stringio'

module Mandy
  # Runs a job's map or reduce step against in-memory input and returns the
  # output stream, so job logic can be exercised without real files.
  class TestRunner
    # job - the job to run; defaults to Mandy::Job.default.
    def initialize(job=Mandy::Job.default)
      @job = job
    end

    # Runs the job's map step.
    #
    # input         - an Array of lines (joined with newlines) or anything
    #                 whose to_s is the raw input text.
    # output_stream - stream the job writes to (a new StringIO by default).
    # blk           - optional block forwarded to the job's run_map.
    #
    # Returns output_stream, rewound to the start.
    def map(input, output_stream=StringIO.new(''), &blk)
      input = input_from_array(input) if input.is_a?(Array)
      input_stream = StringIO.new(input.to_s)
      @job.run_map(input_stream, output_stream, &blk)
      output_stream.rewind
      output_stream
    end

    # Runs the job's reduce step.
    #
    # input         - a Hash of key => value(s), expanded to sorted
    #                 "key<TAB>value" lines, or anything whose to_s is the
    #                 raw input text.
    # output_stream - stream the job writes to (a new StringIO by default).
    # blk           - optional block forwarded to the job's run_reduce.
    #
    # Returns output_stream, rewound to the start.
    def reduce(input, output_stream=StringIO.new(''), &blk)
      input = input_from_hash(input) if input.is_a?(Hash)
      input_stream = StringIO.new(input.to_s)
      @job.run_reduce(input_stream, output_stream, &blk)
      output_stream.rewind
      output_stream
    end

    private

    # Joins the lines with newlines (no trailing newline).
    def input_from_array(input)
      input.join("\n")
    end

    # Expands each key into one "key<TAB>value" line per value (a non-Array
    # value yields a single line), then sorts the lines.
    def input_from_hash(input)
      lines = []
      input.each do |key, values|
        if values.is_a?(Array)
          values.each { |value| lines << "#{key}\t#{value}" }
        else
          lines << "#{key}\t#{values}"
        end
      end
      input_from_array(lines.sort)
    end
  end
end
|
data/lib/tuple.rb
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
module Mandy
  # A name/value pair that can be written to and parsed from a
  # "name,value" string.
  class Tuple

    # Delimiter between name and value (historic spelling kept for callers).
    SEPERATOR = ',' unless defined?(SEPERATOR)

    attr_accessor :name, :value

    def initialize(name, value)
      @name, @value = name, value
    end

    # Returns "name<SEPERATOR>value".
    def to_s
      %(#{@name}#{SEPERATOR}#{@value})
    end

    # Parses a string produced by #to_s.
    #
    # Only the FIRST separator splits name from value, so a value containing
    # the separator (or an empty value) round-trips instead of raising
    # ArgumentError. When no separator is present the value is nil.
    def self.from_s(str)
      name, value = str.to_s.split(SEPERATOR, 2)
      new(name, value)
    end

    # Two tuples are equal when their names and values are equal. Objects that
    # do not expose name and value (nil included) are simply not equal, rather
    # than raising NoMethodError.
    def ==(other)
      other.respond_to?(:name) && other.respond_to?(:value) &&
        name == other.name && value == other.value
    end
  end
end
|
data/readme.md
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
Mandy - Simplified Hadoop distribution for Ruby code
|
2
|
+
====================================================
|
3
|
+
|
4
|
+
Mandy hides the differences and complexities between running map/reduce tasks locally or distributed or in test environments.
|
5
|
+
|
6
|
+
It provides a simple DSL to define new jobs for distribution. See examples/word_count.rb for a very simple demo.
|
7
|
+
Run the word count example locally with...
|
8
|
+
|
9
|
+
bin/mandy-local examples/word_count.rb examples/alice.txt examples/output.txt
|
metadata
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: trafficbroker-mandy
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Andy Kent
|
8
|
+
- Paul Ingles
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2009-07-09 00:00:00 -07:00
|
14
|
+
default_executable:
|
15
|
+
dependencies: []
|
16
|
+
|
17
|
+
description: Map/Reduce
|
18
|
+
email: andy.kent@me.com
|
19
|
+
executables:
|
20
|
+
- mandy-hadoop
|
21
|
+
- mandy-local
|
22
|
+
- mandy-map
|
23
|
+
- mandy-put
|
24
|
+
- mandy-reduce
|
25
|
+
extensions: []
|
26
|
+
|
27
|
+
extra_rdoc_files: []
|
28
|
+
|
29
|
+
files:
|
30
|
+
- bin/mandy-hadoop
|
31
|
+
- bin/mandy-local
|
32
|
+
- bin/mandy-map
|
33
|
+
- bin/mandy-put
|
34
|
+
- bin/mandy-reduce
|
35
|
+
- readme.md
|
36
|
+
- Rakefile
|
37
|
+
- lib/mandy.rb
|
38
|
+
- lib/array_serializer.rb
|
39
|
+
- lib/dsl.rb
|
40
|
+
- lib/job.rb
|
41
|
+
- lib/mapper.rb
|
42
|
+
- lib/reducer.rb
|
43
|
+
- lib/test_runner.rb
|
44
|
+
- lib/tuple.rb
|
45
|
+
has_rdoc: false
|
46
|
+
homepage:
|
47
|
+
post_install_message:
|
48
|
+
rdoc_options: []
|
49
|
+
|
50
|
+
require_paths:
|
51
|
+
- lib
|
52
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
53
|
+
requirements:
|
54
|
+
- - ">="
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
version: "0"
|
57
|
+
version:
|
58
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
59
|
+
requirements:
|
60
|
+
- - ">="
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: "0"
|
63
|
+
version:
|
64
|
+
requirements: []
|
65
|
+
|
66
|
+
rubyforge_project:
|
67
|
+
rubygems_version: 1.2.0
|
68
|
+
signing_key:
|
69
|
+
specification_version: 2
|
70
|
+
summary: Map/Reduce
|
71
|
+
test_files: []
|
72
|
+
|