trafficbroker-mandy 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile ADDED
@@ -0,0 +1,11 @@
1
+ require 'rubygems'
2
+ require "rake"
3
+ require File.expand_path(File.join(File.dirname(__FILE__), 'lib', 'mandy'))
4
+ require 'spec/rake/spectask'
5
+
6
# Running `rake` with no task name runs the :spec task.
task :default => :spec

# Defines the :spec task over every *_spec.rb file found below spec/lib.
Spec::Rake::SpecTask.new(:spec) do |spec_task|
  spec_task.spec_files = FileList.new('spec/lib/**/*_spec.rb')
  # Command-line options handed to the spec runner.
  spec_task.spec_opts = ['-f', 's', '-c', '-L', 'mtime']
end
data/bin/mandy-hadoop ADDED
@@ -0,0 +1,20 @@
1
#!/usr/bin/env ruby
#
# Submits a Mandy job script to Hadoop via the streaming jar found under
# $HADOOP_HOME/contrib/streaming.
#
# Usage: mandy-hadoop <job script> <input> <output> [<config>]

# Returns +path+ unchanged when it is already absolute (starts with "/"),
# otherwise joins it onto the current working directory.
def absolute_path(path)
  path.start_with?('/') ? path : File.join(Dir.pwd, path)
end

abort 'Usage: mandy-hadoop <job script> <input> <output> [<config>]' if ARGV.size < 3

file   = absolute_path(ARGV[0])
input  = ARGV[1]
output = ARGV[2]
config = ARGV[3]

# Load the job script locally first, as the launcher always has; a script
# that fails to load aborts here, before anything is submitted.
require file

hadoop_home = ENV['HADOOP_HOME'].to_s
abort 'HADOOP_HOME is not set' if hadoop_home.empty?

jar_pattern   = File.join(hadoop_home, 'contrib', 'streaming', 'hadoop-*-streaming.jar')
streaming_jar = Dir.glob(jar_pattern).sort.first
abort "No streaming jar matches #{jar_pattern}" if streaming_jar.nil?

# Argument-vector form of Kernel#system: no shell is involved, so spaces or
# shell metacharacters in the paths cannot change the command being run.
command = [
  File.join(hadoop_home, 'bin', 'hadoop'), 'jar', streaming_jar,
  '-additionalconfspec', config.to_s,
  '-input', input,
  '-mapper', "mandy-map #{file}",
  '-reducer', "mandy-reduce #{file}",
  '-file', file,
  '-output', output
]

# system (unlike backticks) leaves hadoop's stdout attached to the terminal
# and reports success, so a failed submission yields a non-zero exit status.
exit(1) unless system(*command)
data/bin/mandy-local ADDED
@@ -0,0 +1,11 @@
1
#!/usr/bin/env ruby
#
# Runs a Mandy job on the local machine as a shell pipeline:
#   cat <input> | mandy-map <job> | sort | mandy-reduce <job> > <output>
#
# Usage: mandy-local <job script> <input file> <output file>

require 'shellwords'

# Returns +path+ unchanged when it is already absolute (starts with "/"),
# otherwise joins it onto the current working directory.
def absolute_path(path)
  path.start_with?('/') ? path : File.join(Dir.pwd, path)
end

abort 'Usage: mandy-local <job script> <input file> <output file>' if ARGV.size < 3

# Every path is shell-escaped because the pipeline below is run by a shell;
# unescaped, a path containing spaces or metacharacters would be split or
# interpreted.
file, input, output = ARGV.first(3).map { |arg| Shellwords.escape(absolute_path(arg)) }

pipeline = "cat #{input} | mandy-map #{file} | sort | mandy-reduce #{file} > #{output}"

# Exit non-zero when the pipeline fails instead of always reporting success.
exit(1) unless system(pipeline)
data/bin/mandy-map ADDED
@@ -0,0 +1,11 @@
1
#!/usr/bin/env ruby

# Returns +path+ as given when it matches /^\//; any other path is joined
# onto the current working directory.
def absolute_path(path)
  return path if path =~ /^\//

  File.join(Dir.pwd, path)
end

# The first argument names the job script. It is required, and then the map
# phase of Mandy::Job.default is run (presumably the script is what sets up
# that default job).
job_script = absolute_path(ARGV[0])
require job_script

Mandy::Job.default.run_map
data/bin/mandy-put ADDED
@@ -0,0 +1,11 @@
1
#!/usr/bin/env ruby
#
# Copies a local file to a Hadoop filesystem destination by running
# `hadoop fs -conf <config> -copyFromLocal <source> <dest>`.
#
# Usage: mandy-put <local source> <destination> <config>

# Returns +path+ unchanged when it is already absolute (starts with "/"),
# otherwise joins it onto the current working directory.
def absolute_path(path)
  path.start_with?('/') ? path : File.join(Dir.pwd, path)
end

abort 'Usage: mandy-put <local source> <destination> <config>' if ARGV.size < 3

source = absolute_path(ARGV[0])
dest   = ARGV[1]
config = absolute_path(ARGV[2])

hadoop_home = ENV['HADOOP_HOME'].to_s
abort 'HADOOP_HOME is not set' if hadoop_home.empty?

hadoop = File.join(hadoop_home, 'bin', 'hadoop')

# Argument-vector form of Kernel#system: the paths are passed verbatim with
# no shell in between, and a failed copy yields a non-zero exit status.
exit(1) unless system(hadoop, 'fs', '-conf', config, '-copyFromLocal', source, dest)
data/bin/mandy-reduce ADDED
@@ -0,0 +1,11 @@
1
#!/usr/bin/env ruby

# Returns +path+ as given when it matches /^\//; any other path is joined
# onto the current working directory.
def absolute_path(path)
  return path if path =~ /^\//

  File.join(Dir.pwd, path)
end

# The first argument names the job script. It is required, and then the
# reduce phase of Mandy::Job.default is run (presumably the script is what
# sets up that default job).
job_script = absolute_path(ARGV[0])
require job_script

Mandy::Job.default.run_reduce
@@ -0,0 +1,22 @@
1
module Mandy
  # Converts between a list of items and a single SEPERATOR-delimited string.
  class ArraySerializer

    SEPERATOR = '|' unless defined?(SEPERATOR)

    class << self
      # Splits +str+ on SEPERATOR and returns the resulting array of strings.
      def from_s(str)
        str.split(SEPERATOR)
      end

      # Splits +str+ on SEPERATOR and converts each piece with Tuple.from_s.
      def tuples_from_s(str)
        pieces = from_s(str)
        pieces.map { |piece| Tuple.from_s(piece) }
      end
    end

    # items - the collection to serialize; nil or false is treated as empty.
    def initialize(items)
      @items = items ? items : []
    end

    # Returns the items joined with SEPERATOR.
    def to_s
      @items.join(SEPERATOR)
    end
  end
end
data/lib/dsl.rb ADDED
@@ -0,0 +1,16 @@
1
module Mandy
  # Mixin whose +map+ and +reduce+ methods forward their block to
  # Mandy::Job.default.
  module DSL

    class << self
      # Inclusion hook: replaces Mandy::Job.default with a new job named
      # 'Untitled Job' every time the module is included.
      def included(klass)
        untitled = Mandy::Job.new('Untitled Job')
        Mandy::Job.default = untitled
      end
    end

    # Hands the given block to the default job's +map+.
    def map(&blk)
      default_job = Mandy::Job.default
      default_job.map(&blk)
    end

    # Hands the given block to the default job's +reduce+.
    def reduce(&blk)
      default_job = Mandy::Job.default
      default_job.reduce(&blk)
    end
  end
end
data/lib/job.rb ADDED
@@ -0,0 +1,32 @@
1
module Mandy
  # A named job made of a mapper class and a reducer class, each compiled
  # from a block by Mandy::Mapper.compile / Mandy::Reducer.compile.
  class Job
    class << self
      # Process-wide job; the DSL assigns it and the command-line runners
      # read it.
      attr_accessor :default
    end

    # name - label stored for the job.
    # blk  - optional block evaluated in the context of the new job, so it
    #        can call +map+ and +reduce+ directly.
    def initialize(name, &blk)
      @name = name
      @mapper_class = nil
      @reducer_class = nil
      instance_eval(&blk) if blk
    end

    # Compiles +blk+ into this job's mapper class.
    def map(&blk)
      @mapper_class = Mandy::Mapper.compile(&blk)
    end

    # Compiles +blk+ into this job's reducer class.
    def reduce(&blk)
      @reducer_class = Mandy::Reducer.compile(&blk)
    end

    # Runs the map phase, reading from +input+ and writing to +output+.
    # When a block is given it is yielded the mapper instance before
    # execution starts.
    def run_map(input=STDIN, output=STDOUT, &blk)
      # A job that never called +map+ falls back to the base Mapper (a
      # pass-through) instead of failing on a nil class.
      mapper = (@mapper_class || Mandy::Mapper).new(input, output)
      yield(mapper) if blk
      mapper.execute
    end

    # Runs the reduce phase, reading from +input+ and writing to +output+.
    # When a block is given it is yielded the reducer instance before
    # execution starts.
    def run_reduce(input=STDIN, output=STDOUT, &blk)
      # Same fallback as run_map, for a job that never called +reduce+.
      reducer = (@reducer_class || Mandy::Reducer).new(input, output)
      yield(reducer) if blk
      reducer.execute
    end
  end
end
data/lib/mandy.rb ADDED
@@ -0,0 +1 @@
1
# Require each Mandy component, in this order, from the directory that
# contains this file.
lib_dir = File.dirname(__FILE__)
%w(tuple array_serializer mapper reducer dsl job test_runner).each do |component|
  require File.join(lib_dir, component)
end
data/lib/mapper.rb ADDED
@@ -0,0 +1,40 @@
1
module Mandy
  # Reads records line by line from an input stream, hands each one to
  # +mapper+, and writes whatever is emitted to an output stream.
  class Mapper

    KEY_VALUE_SEPERATOR = "\t" unless defined?(KEY_VALUE_SEPERATOR)

    # input  - object responding to +each_line+ (defaults to STDIN).
    # output - object responding to +puts+ (defaults to STDOUT).
    def initialize(input=STDIN, output=STDOUT)
      @input, @output = input, output
    end

    # Returns a new subclass of Mandy::Mapper whose +mapper+ method is the
    # given block; without a block the subclass keeps the default mapper.
    def self.compile(&blk)
      Class.new(Mandy::Mapper) do
        define_method(:mapper, &blk) if blk
      end
    end

    # Calls +mapper+ once per input line with (key, value). A line with no
    # separator is passed as (nil, line). The line terminator is stripped
    # from the value.
    def execute
      @input.each_line do |line|
        # Limit of 2: only the first separator splits key from value, so a
        # value that itself contains tabs is kept whole.
        key, value = line.split(KEY_VALUE_SEPERATOR, 2)
        key, value = nil, key if value.nil?
        mapper(key, value.chomp)
      end
    end

    # Writes one record to the output: "key<TAB>value", or just the key when
    # +value+ is nil. A nil key is written as the string 'nil'.
    def emit(key, value=nil)
      key = 'nil' if key.nil?
      @output.puts(value.nil? ? key.to_s : "#{key}#{KEY_VALUE_SEPERATOR}#{value}")
    end

    private

    # Default map: pass each record straight through.
    def mapper(key, value)
      emit(key, value)
    end
  end
end
data/lib/reducer.rb ADDED
@@ -0,0 +1,44 @@
1
module Mandy
  # Reads "key<TAB>value" lines from an input stream, groups consecutive
  # lines that share a key, and hands each group to +reducer+.
  class Reducer

    KEY_VALUE_SEPERATOR = "\t" unless defined?(KEY_VALUE_SEPERATOR)

    # input  - object responding to +each_line+ (defaults to STDIN).
    # output - object responding to +puts+ (defaults to STDOUT).
    def initialize(input=STDIN, output=STDOUT)
      @input, @output = input, output
    end

    # Returns a new subclass of Mandy::Reducer whose +reducer+ method is the
    # given block; without a block the subclass keeps the default reducer.
    def self.compile(&blk)
      Class.new(Mandy::Reducer) do
        define_method(:reducer, &blk) if blk
      end
    end

    # Calls +reducer+ once per run of consecutive lines sharing a key, with
    # (key, array_of_values). Only adjacent equal keys are grouped, so the
    # input is expected to arrive sorted by key. Blank lines are skipped; a
    # line holding only a key contributes a nil value.
    def execute
      last_key, values = nil, []
      @input.each_line do |line|
        # Limit of 2 keeps a value containing tabs whole.
        key, value = line.chomp.split(KEY_VALUE_SEPERATOR, 2)
        next if key.nil? # blank line: nothing to group

        if !last_key.nil? && key != last_key
          reducer(last_key, values)
          values = []
        end
        last_key = key
        values << value
      end
      # Flush the final group: no later key change will ever trigger it.
      reducer(last_key, values) unless last_key.nil?
    end

    # Writes one record to the output: "key<TAB>value", or just the key when
    # +value+ is nil. A nil key is written as the string 'nil'.
    def emit(key, value=nil)
      key = 'nil' if key.nil?
      @output.puts(value.nil? ? key.to_s : "#{key}#{KEY_VALUE_SEPERATOR}#{value}")
    end

    private

    # Default reduce: emit the key together with the collected values
    # (the values array is interpolated into the record as-is).
    def reducer(key, value)
      emit(key, value)
    end
  end
end
@@ -0,0 +1,38 @@
1
# StringIO is used for the in-memory streams below and is not loaded by
# default, so it is required here rather than left to the caller.
require 'stringio'

module Mandy
  # Runs a job's map or reduce phase against in-memory input and returns
  # the output as a rewound stream.
  class TestRunner
    # job - the job to exercise (defaults to Mandy::Job.default).
    def initialize(job=Mandy::Job.default)
      @job = job
    end

    # Runs the map phase. An Array +input+ is joined into lines; any other
    # input is converted with +to_s+. Returns +output_stream+ rewound to the
    # start. The optional block is forwarded to the job's +run_map+.
    def map(input, output_stream=StringIO.new, &blk)
      input = input_from_array(input) if input.is_a?(Array)
      run_phase(:run_map, input, output_stream, &blk)
    end

    # Runs the reduce phase. A Hash +input+ of key => value(s) is expanded
    # into sorted "key<TAB>value" lines; any other input is converted with
    # +to_s+. Returns +output_stream+ rewound to the start. The optional
    # block is forwarded to the job's +run_reduce+.
    def reduce(input, output_stream=StringIO.new, &blk)
      input = input_from_hash(input) if input.is_a?(Hash)
      run_phase(:run_reduce, input, output_stream, &blk)
    end

    private

    # Wraps +input+ in a StringIO, runs the named job phase, and rewinds
    # the output so the caller can read it from the beginning.
    def run_phase(phase, input, output_stream, &blk)
      input_stream = StringIO.new(input.to_s)
      @job.send(phase, input_stream, output_stream, &blk)
      output_stream.rewind
      output_stream
    end

    # Joins the lines with newlines (no trailing newline is added).
    def input_from_array(input)
      input.join("\n")
    end

    # Produces one "key<TAB>value" line per value (an Array value yields one
    # line per element), sorted as strings.
    def input_from_hash(input)
      lines = input.flat_map do |key, values|
        if values.is_a?(Array)
          values.map { |value| "#{key}\t#{value}" }
        else
          ["#{key}\t#{values}"]
        end
      end
      input_from_array(lines.sort)
    end
  end
end
data/lib/tuple.rb ADDED
@@ -0,0 +1,24 @@
1
module Mandy
  # A name/value pair that serializes to "name<SEPERATOR>value".
  class Tuple

    SEPERATOR = ',' unless defined?(SEPERATOR)

    attr_accessor :name, :value

    def initialize(name, value)
      @name, @value = name, value
    end

    # Returns "name,value" (using SEPERATOR).
    def to_s
      "#{@name}#{SEPERATOR}#{@value}"
    end

    # Parses the output of +to_s+ back into a Tuple. Only the first
    # SEPERATOR splits name from value, so a value containing the separator
    # or an empty value ("a,") round-trips instead of raising ArgumentError.
    # A string with no separator yields a tuple whose value is nil.
    def self.from_s(str)
      name, value = str.split(SEPERATOR, 2)
      new(name, value)
    end

    # Two tuples are equal when both name and value are equal. Any object
    # that lacks +name+ or +value+ compares as not equal rather than raising
    # NoMethodError.
    def ==(other)
      other.respond_to?(:name) && other.respond_to?(:value) &&
        name == other.name && value == other.value
    end
  end
end
data/readme.md ADDED
@@ -0,0 +1,9 @@
1
+ Mandy - Simplified Hadoop distribution for Ruby code
2
+ ====================================================
3
+
4
+ Mandy hides the differences between, and the complexity of, running map/reduce tasks locally, distributed on Hadoop, or in a test environment.
5
+
6
+ It provides a simple DSL to define new jobs for distribution. See examples/word_count.rb for a very simple demo.
7
+ Run the word count example locally with...
8
+
9
+ bin/mandy-local examples/word_count.rb examples/alice.txt examples/output.txt
metadata ADDED
@@ -0,0 +1,72 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: trafficbroker-mandy
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Andy Kent
8
+ - Paul Ingles
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2009-07-09 00:00:00 -07:00
14
+ default_executable:
15
+ dependencies: []
16
+
17
+ description: Map/Reduce
18
+ email: andy.kent@me.com
19
+ executables:
20
+ - mandy-hadoop
21
+ - mandy-local
22
+ - mandy-map
23
+ - mandy-put
24
+ - mandy-reduce
25
+ extensions: []
26
+
27
+ extra_rdoc_files: []
28
+
29
+ files:
30
+ - bin/mandy-hadoop
31
+ - bin/mandy-local
32
+ - bin/mandy-map
33
+ - bin/mandy-put
34
+ - bin/mandy-reduce
35
+ - readme.md
36
+ - Rakefile
37
+ - lib/mandy.rb
38
+ - lib/array_serializer.rb
39
+ - lib/dsl.rb
40
+ - lib/job.rb
41
+ - lib/mapper.rb
42
+ - lib/reducer.rb
43
+ - lib/test_runner.rb
44
+ - lib/tuple.rb
45
+ has_rdoc: false
46
+ homepage:
47
+ post_install_message:
48
+ rdoc_options: []
49
+
50
+ require_paths:
51
+ - lib
52
+ required_ruby_version: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - ">="
55
+ - !ruby/object:Gem::Version
56
+ version: "0"
57
+ version:
58
+ required_rubygems_version: !ruby/object:Gem::Requirement
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ version: "0"
63
+ version:
64
+ requirements: []
65
+
66
+ rubyforge_project:
67
+ rubygems_version: 1.2.0
68
+ signing_key:
69
+ specification_version: 2
70
+ summary: Map/Reduce
71
+ test_files: []
72
+