mandy 0.2.14

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile ADDED
@@ -0,0 +1,17 @@
1
+ require 'rubygems'
2
+ require "rake"
3
+ require File.expand_path(File.join(File.dirname(__FILE__), 'lib', 'mandy'))
4
+ require 'spec/rake/spectask'
5
+
6
task :default => :spec

# Runs every spec under spec/lib; files are ordered by mtime (-L mtime).
Spec::Rake::SpecTask.new(:spec) do |t|
  t.spec_files = FileList['spec/lib/**/*_spec.rb']
  t.spec_opts = %w{-f s -c -L mtime}
end

desc 'Build the gem, move it into pkg/ and install it'
task :gem do
  # `sh` echoes each command and aborts the task when one fails, so a broken
  # build can no longer fall through to installing a stale package.
  sh 'sudo gem build mandy.gemspec'
  mkdir_p 'pkg' # unlike `mkdir pkg`, succeeds when pkg/ already exists
  sh 'mv mandy-*.gem pkg/'
  sh 'sudo gem install pkg/mandy-*.gem'
end
data/bin/mandy ADDED
@@ -0,0 +1,21 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "rubygems"
4
+ require "mandy"
5
+
6
puts "\nYou are running Mandy!"
puts "Here are the commands at your disposal..."
puts ''

# An array of pairs (rather than a Hash) keeps the listing in this order on
# every Ruby version. mandy-get is shipped as an executable and listed too.
[
  ['mandy-install', 'Installs the Mandy Rubygem on several hosts via ssh.'],
  ['mandy-local',   'Run a Map/Reduce task locally without requiring hadoop'],
  ['mandy-hadoop',  'Run a Map/Reduce task on hadoop using the provided cluster config'],
  ['mandy-rm',      'remove a file or directory from HDFS'],
  ['mandy-put',     'upload a file into HDFS'],
  ['mandy-get',     'download a file or directory from HDFS, merged into one local file'],
  ['mandy-map',     'Run a map task reading on STDIN and writing to STDOUT'],
  ['mandy-reduce',  'Run a reduce task reading on STDIN and writing to STDOUT']
].each do |command, description|
  puts "#{command.ljust(15)} #{description}"
end
data/bin/mandy-get ADDED
@@ -0,0 +1,29 @@
1
+ #!/usr/bin/env ruby
2
+ require 'optparse'
3
+ require 'ostruct'
4
+
5
options = OpenStruct.new

parser = OptionParser.new do |opts|
  opts.banner = "USAGE: mandy-get hdfs_file_location local_file_destination [options]"

  opts.on("-c", "--conf HADOOP_CONF", "Use this cluster xml config file.") do |config|
    options.config = config
  end

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end
parser.parse!

# Both positional arguments are required; without them hadoop would be
# invoked with blank paths.
if ARGV.size < 2
  STDERR.puts parser
  exit 1
end

# Returns +path+ unchanged when it starts with "/", otherwise resolves it
# against the current working directory.
def absolute_path(path)
  path =~ /^\// ? path : File.join(Dir.pwd, path)
end

remote_file = ARGV[0]
local_file = ARGV[1]
config = absolute_path(options.config || 'cluster.xml')

# Argument-vector form of system: no shell is involved, so paths containing
# spaces or shell metacharacters reach hadoop intact.
hadoop = File.join(ENV['HADOOP_HOME'].to_s, 'bin', 'hadoop')
exit 1 unless system(hadoop, 'fs', '-conf', config, '-getmerge', remote_file, local_file)
data/bin/mandy-hadoop ADDED
@@ -0,0 +1,74 @@
1
+ #!/usr/bin/env ruby
2
+ require "rubygems"
3
+ require "mandy"
4
+ require 'optparse'
5
+ require 'ostruct'
6
+ require 'uri'
7
+
8
options = OpenStruct.new
# Every -v / -j occurrence is collected so they no longer overwrite each other.
options.cmdenv = []

parser = OptionParser.new do |opts|
  opts.banner = "USAGE: mandy-hadoop script input output [options]"

  opts.on("-p", "--payload PAYLOAD", "Add a working directory to be sent to the cluster.") do |payload|
    options.payload = payload
  end

  opts.on("-c", "--conf HADOOP_CONF", "Use this cluster xml config file.") do |config|
    options.config = config
  end

  opts.on("-v", '--variables name=value', "Pass additional parameters to jobs") do |config|
    options.cmdenv << config
  end

  opts.on("-j", '--json {"key":"1 value"}', "Pass JSON encoded parameters to jobs") do |config|
    # URI.encode was removed in Ruby 3.0; the default parser's escape is the
    # same routine and exists on both old and new Rubies.
    encoded = URI.respond_to?(:encode) ? URI.encode(config) : URI::DEFAULT_PARSER.escape(config)
    options.cmdenv << "json=#{encoded}"
  end

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end
parser.parse!

# script, input and output are all mandatory.
if ARGV.size < 3
  STDERR.puts parser
  exit 1
end

# Returns +path+ unchanged when it starts with "/", otherwise resolves it
# against the current working directory.
def absolute_path(path)
  path =~ /^\// ? path : File.join(Dir.pwd, path)
end

file = ARGV[0]
filename = File.basename(file)
input = ARGV[1]
output_folder = ARGV[2]
config = options.config || 'cluster.xml'
payload = options.payload ? Mandy::Packer.pack(options.payload) : ARGV[0]
# Packer.pack hands back its argument untouched when that is already a file,
# so only a path that differs from the one supplied is a temp archive we own.
packed = !options.payload.nil? && payload != options.payload
# Emit one -cmdenv per variable, and nothing at all when none were given.
cmdenv = options.cmdenv.map { |pair| "-cmdenv #{pair}" }.join(' ')

# Remove only the temporary archive created above, never a user-supplied file.
at_exit { Mandy::Packer.cleanup!(payload) if packed }

require absolute_path(file)

output = nil

# Jobs run in definition order; each job reads the previous job's output.
Mandy::Job.jobs.each_with_index do |job, i|

  jobconf = job.settings.map { |key, value| %(-D #{key}='#{value}') }.join(' ')
  output = File.join(output_folder, "#{i+1}-#{job.name.downcase.gsub(/\W/, '-')}")

  command = %($HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/contrib/streaming/hadoop-*-streaming.jar #{jobconf}\
    -conf '#{config}' \
    -input "#{input}" \
    -mapper "mandy-map #{filename} '#{job.name}' #{File.basename(payload)}" \
    -reducer "mandy-reduce #{filename} '#{job.name}' #{File.basename(payload)}" \
    -file "#{payload}" \
    #{cmdenv} \
    -output "#{output}")

  `#{command}`
  # Stop the chain when a job fails: the next job's input would not exist and
  # the output location printed below would be misleading.
  abort "mandy-hadoop: job '#{job.name}' failed" unless $?.success?

  input = output
end

# print out the output location so caller can know where to get the results from
puts output
data/bin/mandy-install ADDED
@@ -0,0 +1,11 @@
1
+ #!/usr/bin/env ruby
2
+
3
# With no hosts given there is nothing to do: show the usage line and stop.
if ARGV.empty?
  puts "USAGE: mandy-install user@server1.com [user@server2.com ...]"
  exit
end

# Install the gem on each host in turn over ssh.
ARGV.each do |host|
  puts "Installing on #{host}..."
  remote_install = %(ssh #{host} "sudo gem install trafficbroker-mandy --source http://gems.github.com")
  system remote_install
end
data/bin/mandy-local ADDED
@@ -0,0 +1,27 @@
1
+ #!/usr/bin/env ruby
2
+
3
# All three positional arguments are required; the previous check only caught
# the zero-argument case and crashed later on a nil path.
if ARGV.size < 3
  abort "USAGE: mandy-local my_script.rb local_input_file local_output_folder"
end

require "fileutils"

# Returns +path+ unchanged when it starts with "/", otherwise resolves it
# against the current working directory.
def absolute_path(path)
  path =~ /^\// ? path : File.join(Dir.pwd, path)
end

file = absolute_path(ARGV[0])
input = absolute_path(ARGV[1])
# Keep the folder as a plain String instead of relying on mkdir_p's return
# value, which is an Array on current Rubies.
output_folder = absolute_path(ARGV[2])
FileUtils.mkdir_p(output_folder)
require file

out = nil
# Jobs run in definition order; each job reads the previous job's output file.
Mandy::Job.jobs.each_with_index do |job, i|
  out = File.join(output_folder, "#{i+1}-#{job.name.downcase.gsub(/\W/, '-')}")
  puts "Running #{job.name}..."
  # Paths are quoted so that directories containing spaces survive the shell.
  `cat "#{input}" | mandy-map "#{file}" "#{job.name}" | sort | mandy-reduce "#{file}" "#{job.name}" > "#{out}"`
  input = out
end

puts out
data/bin/mandy-map ADDED
@@ -0,0 +1,24 @@
1
+ #!/usr/bin/env ruby
2
+ require "rubygems"
3
+ require "mandy"
4
+
5
# The script path and job name are both required. Usage goes to STDERR
# (via abort) because STDOUT is the data channel of a streaming map task.
if ARGV.size < 2
  abort "USAGE: mandy-map my_script.rb 'Job Name' [payload]"
end

# An optional third argument names a payload archive to unpack first.
if ARGV.size > 2
  payload = ARGV[2]
  Mandy::Packer.unpack(payload)
end

# Returns +path+ unchanged when it starts with "/", otherwise resolves it
# against the current working directory.
def absolute_path(path)
  path =~ /^\// ? path : File.join(Dir.pwd, path)
end

file = absolute_path(ARGV[0])
job_name = ARGV[1]

require file

job = Mandy::Job.find_by_name(job_name)
# find_by_name returns nil for an unknown name; fail with a clear message
# instead of a NoMethodError on nil.
abort "mandy-map: no job named #{job_name.inspect} is defined in #{file}" if job.nil?
job.run_map
data/bin/mandy-put ADDED
@@ -0,0 +1,28 @@
1
+ #!/usr/bin/env ruby
2
+ require 'optparse'
3
+ require 'ostruct'
4
+
5
options = OpenStruct.new

parser = OptionParser.new do |opts|
  opts.banner = "USAGE: mandy-put local_file_or_folder hdfs_destination_location [options]"

  opts.on("-c", "--conf HADOOP_CONF", "Use this cluster xml config file.") do |config|
    options.config = config
  end

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end
parser.parse!

# Both positional arguments are required; without them the script crashed on
# a nil path before ever reaching hadoop.
if ARGV.size < 2
  STDERR.puts parser
  exit 1
end

# Returns +path+ unchanged when it starts with "/", otherwise resolves it
# against the current working directory.
def absolute_path(path)
  path =~ /^\// ? path : File.join(Dir.pwd, path)
end

source = absolute_path(ARGV[0])
dest = ARGV[1]
config = options.config || 'cluster.xml'

# Argument-vector form of system: no shell is involved, so paths containing
# spaces or shell metacharacters reach hadoop intact.
hadoop = File.join(ENV['HADOOP_HOME'].to_s, 'bin', 'hadoop')
exit 1 unless system(hadoop, 'fs', '-conf', config, '-copyFromLocal', source, dest)
data/bin/mandy-reduce ADDED
@@ -0,0 +1,24 @@
1
+ #!/usr/bin/env ruby
2
+ require "rubygems"
3
+ require "mandy"
4
+
5
# The script path and job name are both required. Usage goes to STDERR
# (via abort) because STDOUT is the data channel of a streaming reduce task.
if ARGV.size < 2
  abort "USAGE: mandy-reduce my_script.rb 'Job Name' [payload]"
end

# Returns +path+ unchanged when it starts with "/", otherwise resolves it
# against the current working directory.
def absolute_path(path)
  path =~ /^\// ? path : File.join(Dir.pwd, path)
end

# An optional third argument names a payload archive to unpack first.
if ARGV.size > 2
  payload = ARGV[2]
  Mandy::Packer.unpack(payload)
end

file = absolute_path(ARGV[0])
job_name = ARGV[1]

require file

job = Mandy::Job.find_by_name(job_name)
# find_by_name returns nil for an unknown name; fail with a clear message
# instead of a NoMethodError on nil.
abort "mandy-reduce: no job named #{job_name.inspect} is defined in #{file}" if job.nil?
job.run_reduce
data/bin/mandy-rm ADDED
@@ -0,0 +1,23 @@
1
+ #!/usr/bin/env ruby
2
+ require 'optparse'
3
+ require 'ostruct'
4
+
5
options = OpenStruct.new

parser = OptionParser.new do |opts|
  opts.banner = "USAGE: mandy-rm file_or_folder_on_hdfs [options]"

  opts.on("-c", "--conf HADOOP_CONF", "Use this cluster xml config file.") do |config|
    options.config = config
  end

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end
parser.parse!

# Refuse to run a recursive remove without an explicit target.
if ARGV.empty?
  STDERR.puts parser
  exit 1
end

file = ARGV[0]
config = options.config || 'cluster.xml'

# Argument-vector form of system: no shell is involved, so the target path is
# passed to hadoop exactly as typed.
hadoop = File.join(ENV['HADOOP_HOME'].to_s, 'bin', 'hadoop')
exit 1 unless system(hadoop, 'fs', '-conf', config, '-rmr', file)
data/lib/dsl.rb ADDED
@@ -0,0 +1,10 @@
1
module Mandy
  # Mixin that provides the `job` keyword used to declare jobs.
  module DSL
    # Builds a Mandy::Job called +name+, evaluates the optional block in the
    # job's own context, appends the job to Mandy::Job.jobs and returns it.
    def job(name, &blk)
      definition = Mandy::Job.new(name, &blk)
      Mandy::Job.jobs.push(definition)
      definition
    end
  end
end
data/lib/job.rb ADDED
@@ -0,0 +1,92 @@
1
module Mandy
  # One map/reduce job: a name, a table of job settings, and the mapper and
  # reducer classes to run. Both default to the pass-through implementations.
  class Job
    class << self
      # Every job registered so far, in definition order.
      def jobs
        @jobs ||= []
      end

      # Returns the registered job whose name equals +name+, or nil.
      def find_by_name(name)
        jobs.find {|job| job.name == name }
      end
    end

    attr_reader :settings
    attr_reader :name

    # Creates the job, records its name under 'mapred.job.name', and
    # evaluates the optional block in the context of the new job.
    def initialize(name, &blk)
      @name = name
      @settings = {}
      @modules = []
      @mapper_class = Mandy::Mappers::PassThroughMapper
      @reducer_class = Mandy::Reducers::PassThroughReducer
      set('mapred.job.name', name)
      instance_eval(&blk) if blk
    end

    # Registers modules that are included into mapper/reducer classes by
    # later calls to #map and #reduce.
    def mixin(*modules)
      modules.each {|m| @modules << m}
    end
    alias_method :serialize, :mixin

    def input_format(format)
      @input_format = format
    end

    def output_format(format)
      @output_format = format
    end

    # Stores a setting; key and value are both converted to strings.
    def set(key, value)
      @settings[key.to_s] = value.to_s
    end

    def map_tasks(count)
      set('mapred.map.tasks', count)
    end

    def reduce_tasks(count)
      set('mapred.reduce.tasks', count)
    end

    # Registers a store under +name+ in Mandy.stores. Only :hbase is known;
    # any other type raises.
    def store(type, name, options={})
      Mandy.stores[name] = case type
      when :hbase
        Stores::HBase.new(options)
      else
        raise "Unknown store type #{type}"
      end
    end

    # Sets the mapper to +klass+, or to a class compiled from the block, and
    # includes every module registered so far via #mixin.
    def map(klass=nil, &blk)
      @mapper_class = klass || Mandy::Mappers::Base.compile(&blk)
      @modules.each {|m| @mapper_class.send(:include, m) }
      @mapper_class
    end

    # Sets the reducer to +klass+, or to a class compiled from the block, and
    # includes every module registered so far via #mixin.
    def reduce(klass=nil, &blk)
      @reducer_class = klass || Mandy::Reducers::Base.compile(&blk)
      @modules.each {|m| @reducer_class.send(:include, m) }
      @reducer_class
    end

    # Runs the map phase. The mapper instance is yielded before execution
    # when a block is given.
    def run_map(input=STDIN, output=STDOUT, &blk)
      mapper = runnable_mapper_class.new(input, output, @input_format, @output_format)
      yield(mapper) if blk
      mapper.execute
    end

    # Runs the reduce phase. The reducer instance is yielded before execution
    # when a block is given.
    def run_reduce(input=STDIN, output=STDOUT, &blk)
      reducer = @reducer_class.new(input, output, @input_format, @output_format)
      yield(reducer) if blk
      reducer.execute
    end

    private

    def reducer_defined?
      @reducer_class != Mandy::Reducers::PassThroughReducer
    end

    # When no reducer is defined the mapper gets output formatting. The
    # module is included into a throw-away subclass so the mapper class
    # itself, which other jobs may share (e.g. PassThroughMapper), is never
    # modified.
    def runnable_mapper_class
      return @mapper_class if reducer_defined?
      Class.new(@mapper_class) { include Mandy::IO::OutputFormatting }
    end

  end
end
data/lib/mandy.rb ADDED
@@ -0,0 +1,35 @@
1
+ require "rubygems"
2
+ require "json"
3
+ require "uri"
4
+
5
+ %w(
6
+ support/formatting
7
+ task
8
+ dsl
9
+ job
10
+ packer
11
+ support/tuple
12
+ support/array_serializer
13
+ mappers/base_mapper
14
+ mappers/transpose_mapper
15
+ mappers/pass_through_mapper
16
+ reducers/base_reducer
17
+ reducers/pass_through_reducer
18
+ reducers/sum_reducer
19
+ reducers/max_reducer
20
+ reducers/min_reducer
21
+ reducers/transpose_reducer
22
+ serializers/json
23
+ stores/hbase
24
+ stores/in_memory
25
+ test_runner
26
+ ruby-hbase
27
+ ).each {|file| require File.join(File.dirname(__FILE__), file) }
28
+
29
module Mandy
  # Process-wide table of stores, created empty on first access.
  def self.stores
    @stores ||= {}
  end
end
@@ -0,0 +1,30 @@
1
module Mandy
  module Mappers
    # Base class for mappers: reads records line by line and passes each
    # key/value pair to #mapper.
    class Base < Mandy::Task
      include Mandy::IO::InputFormatting

      # Returns a new anonymous subclass of Base. When a block is given it
      # becomes that subclass's #mapper method.
      def self.compile(&blk)
        compiled = Class.new(Mandy::Mappers::Base)
        compiled.send(:define_method, :mapper, blk) if blk
        compiled
      end

      # Splits every input line on the first separator into key and value;
      # a line without a separator is treated as a value with a nil key.
      def execute
        @input.each_line do |record|
          fields = record.split(KEY_VALUE_SEPERATOR, 2)
          fields.unshift(nil) if fields.size < 2
          key, value = fields
          value.chomp!
          mapper(input_deserialize_key(key), input_deserialize_value(value))
        end
      end

      private

      # Default mapper: does nothing.
      def mapper(key,value)
      end
    end
  end
end
@@ -0,0 +1,10 @@
1
module Mandy
  module Mappers
    # Default mapper: re-emits every record unchanged.
    class PassThroughMapper < Base
      # Emits +value+ under +key+.
      def mapper(key, value)
        emit(key, value)
      end
    end
  end
end
@@ -0,0 +1,10 @@
1
module Mandy
  module Mappers
    # Mapper that swaps key and value for every record.
    class TransposeMapper < Base
      # Emits the incoming value as the key and the incoming key as the value.
      def mapper(key, value)
        emit(value, key)
      end
    end
  end
end
data/lib/packer.rb ADDED
@@ -0,0 +1,25 @@
1
+ require "fileutils"
2
+
3
module Mandy
  # Packs a working directory into a tar archive, and unpacks or removes such
  # archives. External commands are run in argument-vector form, so paths
  # containing spaces or shell metacharacters are handled correctly.
  class Packer
    TMP_DIR = '/tmp/mandy'

    # Returns +dir+ unchanged when it is a regular file. Otherwise archives
    # the non-hidden entries of +dir+ into a new tar under TMP_DIR and
    # returns the archive's path.
    def self.pack(dir)
      return dir if File.file?(dir)
      FileUtils.mkdir_p(TMP_DIR)
      # The pid keeps two processes started within the same second from
      # writing to the same archive.
      tmp_path = "#{TMP_DIR}/packed-job-#{Time.now.to_i}-#{Process.pid}.tar"
      Dir.chdir(dir) { system('tar', '-cf', tmp_path, *Dir.glob('*')) }
      tmp_path
    end

    # Extracts +file+ into the current directory. Returns false without doing
    # anything unless the name ends in ".tar".
    def self.unpack(file)
      return false unless File.extname(file) == '.tar'
      system('tar', '-xf', file)
    end

    # Deletes +file+. Returns false without doing anything unless the name
    # ends in ".tar"; returns true after a deletion attempt.
    def self.cleanup!(file)
      return false unless File.extname(file) == '.tar'
      FileUtils.rm_f(file)
      true
    end
  end
end
@@ -0,0 +1,36 @@
1
+ module Mandy
2
+ module Reducers
3
+ class Base < Mandy::Task
4
+ include Mandy::IO::OutputFormatting
5
+
6
# Returns a new anonymous subclass of Mandy::Reducers::Base. When a block is
# given it becomes that subclass's #reducer method.
def self.compile(&blk)
  compiled = Class.new(Mandy::Reducers::Base)
  compiled.send(:define_method, :reducer, blk) if blk
  compiled
end
13
+
14
# Reads key/value lines from @input, groups consecutive lines that share a
# key, and calls #reducer once per group with the deserialised key and values.
#
# Differences from the previous implementation:
# * every group is deserialised, not only the last one;
# * lines are split on the first separator only, so values may contain it;
# * a line without a separator yields an empty-string value instead of
#   raising NoMethodError on nil;
# * empty input no longer triggers a call with a nil key and no values.
def execute
  flush = lambda do |group_key, group_values|
    reducer(deserialize_key(group_key), group_values.map {|v| deserialize_value(v) })
  end

  last_key, values = nil, []
  @input.each_line do |line|
    key, value = line.chomp.split(KEY_VALUE_SEPERATOR, 2)
    value ||= ''
    last_key = key if last_key.nil?
    if key != last_key
      flush.call(last_key, values)
      last_key, values = key, []
    end
    values << value
  end
  # values is non-empty exactly when at least one line was read.
  flush.call(last_key, values) unless values.empty?
end
28
+
29
+ private
30
+
31
# Default reducer: does nothing and returns nil.
def reducer(key, values)
end
34
+ end
35
+ end
36
+ end
@@ -0,0 +1,10 @@
1
module Mandy
  module Reducers
    # Emits, for each key, the largest of its values.
    class MaxReducer < Base
      # Converts the values to floats in place, then emits the maximum.
      def reducer(key, values)
        values.map! { |raw| raw.to_f }
        largest = values.max
        emit(key, largest)
      end
    end
  end
end
@@ -0,0 +1,10 @@
1
module Mandy
  module Reducers
    # Emits, for each key, the smallest of its values.
    class MinReducer < Base
      # Converts the values to floats in place, then emits the minimum.
      def reducer(key, values)
        values.map! { |raw| raw.to_f }
        smallest = values.min
        emit(key, smallest)
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
module Mandy
  module Reducers
    # Default reducer: re-emits every value under its key.
    class PassThroughReducer < Base
      def reducer(key, values)
        values.each do |item|
          emit(key, item)
        end
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
module Mandy
  module Reducers
    # Emits, for each key, the sum of its values.
    class SumReducer < Base
      # Each value is converted with to_f before being added to the total.
      def reducer(key, values)
        total = values.inject(0) { |running, number| running + number.to_f }
        emit(key, total)
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
module Mandy
  module Reducers
    # Swaps key and value: each value is emitted as a key, with the original
    # key as its value.
    class TransposeReducer < Base
      def reducer(key, values)
        values.each do |item|
          emit(item, key)
        end
      end
    end
  end
end
data/lib/ruby-hbase.rb ADDED
@@ -0,0 +1,10 @@
1
+ $:.unshift File.dirname(__FILE__)
2
+
3
+ require "rubygems"
4
+ require "net/http"
5
+ require "erb"
6
+ require "xml/libxml"
7
+
8
+ require "ruby-hbase/xml_decoder"
9
+ require "ruby-hbase/hbase_table"
10
+ require "ruby-hbase/scanner"
@@ -0,0 +1,166 @@
1
module HBase
  # Raised by HTable#get when the server answers 404 for the requested row.
  # Inherits from StandardError so that a plain `rescue` can catch it.
  class RowNotFound < StandardError
  end

  # HTTP client for a single table, addressed through "/api/<table>" paths on
  # the host and port taken from the table URI.
  class HTable
    include XmlDecoder

    # table_uri - URI string; its last path segment is used as the table name.
    def initialize(table_uri)
      @table_uri = table_uri

      @uri = URI.parse(table_uri)

      @host, @table_name = @uri.host, @uri.path.split("/").last
    end

    def name
      @table_name
    end

    ######################
    # Meta-type requests

    def start_keys
      raise NotImplementedError
    end

    # Fetches the table description and returns an array of hashes, one per
    # column family, each with a :name entry.
    def column_descriptors
      column_families = []

      # get the xml for the column descriptors
      response = Net::HTTP.get_response(@uri.host, "/api/#{@table_name}", @uri.port)

      # parse the xml into a document
      doc = XML::Parser.string(response.body).parse

      doc.find("/table/columnfamilies/columnfamily").each do |node|
        # chop removes the final character of the stripped name
        # (presumably a trailing ':' -- TODO confirm against the server output)
        column_families << {:name => node.find_first("name").content.strip.chop}
      end
      column_families
    end

    #####################
    # Standard CRUD ops

    DEFAULT_GET_OPTIONS = {:timestamp => nil, :columns => nil}

    # Fetches one row and returns its column => value hash.
    #
    # options - :columns (one name or an array of names) and :timestamp
    #           (converted to integer milliseconds).
    #
    # Raises RowNotFound on 404 and a RuntimeError on 204. Any other
    # non-200 status returns nil.
    def get(key, options = {})
      opts = DEFAULT_GET_OPTIONS.merge(options)

      columns = Array(opts.delete(:columns)).compact
      timestamp = opts.delete(:timestamp)
      timestamp = (timestamp.to_f * 1000).to_i.to_s if timestamp

      Net::HTTP.start(@uri.host, @uri.port) do |session|
        columns_query = columns.map{ |name| "column=#{name}" }.join("&")

        ts_section = timestamp ? "/#{timestamp}" : ""

        query_string = "?" + columns_query

        query = "/api/#{@table_name}/row/#{url_encode(key)}#{ts_section}#{query_string}"
        response = session.get(query, {"Accept" => "*/*"})

        case response.code.to_i
        when 200 #success!
          parse_row_result(response.body).last
        when 204 #no data - probably an incorrect colname
          raise "Didn't get any data back - check your column names!"
        when 404
          raise RowNotFound, "Could not find row '#{key}'"
        else
          nil
        end
      end
    end

    # Writes the given column name => value pairs to row +key+. Values are
    # sent base64-encoded. Returns true on 200, raises otherwise.
    def put(key, keys_and_values, timestamp = nil)
      Net::HTTP.start(@uri.host, @uri.port) do |session|
        xml = "<columns>"

        ts_section = timestamp ? "/#{(timestamp.to_f * 1000).to_i}" : ""

        keys_and_values.each do |name, value|
          xml << "<column><name>#{name}</name><value>#{[value.to_s].pack("m")}</value></column>"
        end

        xml << "</columns>"

        query = "/api/#{@table_name}/row/#{url_encode(key)}#{ts_section}"
        response = session.post(query, xml, {"Content-type" => "text/xml"})

        case response.code.to_i
        when 200
          true
        else
          unexpected_response(response)
        end
      end
    end

    # Deletes +row+, optionally restricted to +columns+. Returns true on 202,
    # raises otherwise. The row key is URL-encoded exactly as in #get and
    # #put, so keys containing reserved characters address the same row.
    # NOTE(review): +timestamp+ is accepted but not used.
    def delete(row, columns = nil, timestamp = nil)
      Net::HTTP.start(@uri.host, @uri.port) do |session|
        columns_query = Array(columns).compact.map{ |name| "column=#{name}" }.join("&")

        response = session.delete("/api/#{@table_name}/row/#{url_encode(row)}?#{columns_query}")
        case response.code.to_i
        when 202
          return true
        else
          unexpected_response(response)
        end
      end
    end

    #######################
    # Scanning interface

    # Opens a scanner and returns a Scanner built from the Location header
    # of the 201 response. Raises on any other status.
    # NOTE(review): the timestamp is formatted like a path segment ("/<ms>")
    # but is joined into the query string -- confirm against the server API.
    def get_scanner(start_row, end_row, timestamp = nil, columns = nil)
      start_row_query = start_row ? "start_row=#{start_row}" : nil
      end_row_query = end_row ? "end_row=#{end_row}" : nil
      timestamp_section = timestamp ? "/#{(timestamp.to_f * 1000).to_i}" : nil
      columns_section = columns ? columns.map{ |col| "column=#{col}" }.join("&") : nil

      query_string = [start_row_query, end_row_query,
                      timestamp_section, columns_section].compact.join("&")

      # open the scanner
      Net::HTTP.start(@uri.host, @uri.port) do |session|
        response = session.post("/api/#{@table_name}/scanner?#{query_string}",
          "", {"Accept" => "text/xml"}
        )

        case response.code.to_i
        when 201
          # redirect - grab the path and send
          Scanner.new(self, "http://#{@uri.host}:#{@uri.port}" + response["Location"])
        else
          unexpected_response(response)
        end
      end
    end

    private

    def url_encode(str)
      ERB::Util.url_encode(str)
    end

    def unexpected_response(response)
      raise "Unexpected response code #{response.code.to_i}:\n#{response.body}"
    end
  end
end
@@ -0,0 +1,55 @@
1
module HBase
  # Iterates over the rows of an open scanner identified by its URI.
  class Scanner
    include XmlDecoder

    def initialize(table, scanner_uri)
      @table = table
      @scanner_uri = scanner_uri
    end

    # Not implemented: does nothing.
    def close
    end

    # Not implemented: does nothing.
    def next
    end

    # Posts to the scanner URI repeatedly over one HTTP session. Each 200
    # response is parsed and yielded as (row name, values); a 404 ends the
    # iteration; any other status raises.
    def each
      endpoint = URI.parse(@scanner_uri)
      Net::HTTP.start(endpoint.host, endpoint.port) do |session|
        response = session.post(@scanner_uri, "")
        until response.code.to_i == 404
          unless response.code.to_i == 200
            raise "Unexpected response code #{response.code}, body:\n#{response.body}"
          end
          yield(*parse_row_result(response.body))
          response = session.post(@scanner_uri, "")
        end
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
module RubyHbase #:nodoc:
  module VERSION #:nodoc:
    MAJOR = 0
    MINOR = 0
    TINY = 4

    # Dotted version string assembled from the three parts above.
    STRING = "#{MAJOR}.#{MINOR}.#{TINY}"
  end
end
@@ -0,0 +1,18 @@
1
module HBase
  # Mixin that turns a row XML document into Ruby values.
  module XmlDecoder
    # Parses +xml+ and returns [row_name, values].
    #
    # row_name is the stripped content of /row/name, or nil when that node is
    # absent. values maps each column's base64-decoded name to its
    # base64-decoded value.
    def parse_row_result(xml)
      doc = XML::Parser.string(xml).parse
      decode = lambda { |node| node.content.strip.unpack('m').first }

      name_node = doc.root.find_first("/row/name")
      row_name = name_node && name_node.content.strip
      row_name = nil unless name_node

      values = {}
      doc.find("/row/columns/column").each do |column|
        column_name = decode.call(column.find_first("name"))
        values[column_name] = decode.call(column.find_first("value"))
      end

      [row_name, values]
    end
  end
end
@@ -0,0 +1,13 @@
1
module Mandy
  module Serializers
    # Mixin that serialises values as JSON text.
    module Json
      # Returns the JSON text for +value+.
      def serialize_value(value)
        value.to_json
      end

      # Parses JSON text back into Ruby objects.
      def deserialize_value(value)
        JSON.parse(value)
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
module Mandy
  module Stores
    # Store backed by an HBase table; reads and writes are delegated to an
    # ::HBase::HTable built from options[:url].
    class HBase
      attr_reader :options

      def initialize(options)
        @options = options
        @table = ::HBase::HTable.new(options[:url])
      end

      # Delegates to the table's #get.
      def get(key)
        @table.get(key)
      end

      # Delegates to the table's #put.
      def put(key, values)
        @table.put(key, values)
      end

      # Two stores are equal when they have the same class and equal options.
      def ==(other)
        return false unless self.class == other.class
        self.options == other.options
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
module Mandy
  module Stores
    # Store that keeps its rows in a Hash inside the current process.
    class InMemory
      attr_reader :options

      def initialize(options={})
        @table = {}
        @options = options
      end

      # Returns what was stored under +key+ (compared as a string), or nil.
      def get(key)
        @table.fetch(key.to_s, nil)
      end

      # Stores +values+ under +key+ (converted to a string) and returns them.
      def put(key, values)
        @table.store(key.to_s, values)
      end

      # Two stores are equal when they have the same class and equal options;
      # the stored rows are not compared.
      def ==(other)
        return false unless self.class == other.class
        self.options == other.options
      end
    end
  end
end
@@ -0,0 +1,32 @@
1
module Mandy
  # Wraps a list so it can be written out as one separator-joined string.
  class ArraySerializer

    SEPERATOR = '|' unless defined?(SEPERATOR)

    attr_reader :items

    # A nil +items+ is treated as an empty list.
    def initialize(items)
      @items = items || []
    end

    # The items joined with SEPERATOR.
    def to_s
      @items.join(SEPERATOR)
    end

    # Equal to another serializer with equal items, or to a plain Array equal
    # to the items.
    def ==(other)
      return true if self.class == other.class && self.items == other.items
      other.is_a?(Array) && self.items == other
    end

    def to_a
      @items
    end

    # Splits +str+ on SEPERATOR and returns the parts as an Array.
    def self.from_s(str)
      str.split(SEPERATOR)
    end

    # Splits +str+ on SEPERATOR and builds a Tuple from each part.
    def self.tuples_from_s(str)
      from_s(str).map do |part|
        Tuple.from_s(part)
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
module Mandy
  module IO
    # Input side: with the :plain input format keys and values are passed
    # through untouched; otherwise they go through deserialize_key /
    # deserialize_value.
    module InputFormatting
      def input_deserialize_key(key)
        input_format == :plain ? key : deserialize_key(key)
      end

      def input_deserialize_value(value)
        input_format == :plain ? value : deserialize_value(value)
      end
    end

    # Output side: with the :plain output format keys and values are passed
    # through untouched; otherwise they go through serialize_key /
    # serialize_value.
    module OutputFormatting
      def output_serialize_key(key)
        output_format == :plain ? key : serialize_key(key)
      end

      def output_serialize_value(value)
        output_format == :plain ? value : serialize_value(value)
      end
    end
  end
end
@@ -0,0 +1,40 @@
1
module Mandy
  # A name/value pair that serialises as "name,value".
  class Tuple

    SEPERATOR = ',' unless defined?(SEPERATOR)

    attr_accessor :name, :value

    # name_accessor / value_accessor - optional extra accessor names for this
    # tuple, e.g. Tuple.new('cat', 3, :word, :count) also answers to #word,
    # #word=, #count and #count=.
    def initialize(name, value, name_accessor = nil, value_accessor = nil)
      @name, @value = name, value
      alias_accessor(name_accessor, :name) unless name_accessor.nil?
      alias_accessor(value_accessor, :value) unless value_accessor.nil?
    end

    def to_s
      %(#{@name}#{SEPERATOR}#{@value})
    end

    # Parses "A,B" into a tuple. Raises unless the string has exactly two
    # parts. The -1 limit keeps a trailing empty part, so "a," (what #to_s
    # produces for an empty value) parses back instead of raising.
    def self.from_s(str)
      parts = str.split(SEPERATOR, -1)
      raise "Can't create tuple from #{str.inspect}. Format should be 'A#{SEPERATOR}B'" unless parts.size==2
      new(*parts)
    end

    def inspect
      %(<Tuple #{self.to_s}>)
    end

    # Tuples are equal when they have the same class, name and value.
    def ==(other)
      return false unless self.class == other.class
      self.name == other.name && self.value == other.value
    end

    private

    # Defines the alias on this instance's singleton class. Aliasing on the
    # Tuple class itself changed every tuple in the process, and could even
    # redefine #name or #value for all of them.
    def alias_accessor(new_accessor, old_accessor)
      own_class = (class << self; self; end)
      own_class.send(:alias_method, new_accessor, old_accessor)
      own_class.send(:alias_method, :"#{new_accessor}=", :"#{old_accessor}=")
    end
  end
end
data/lib/task.rb ADDED
@@ -0,0 +1,83 @@
1
module Mandy
  # Common base for mappers and reducers: holds the input/output streams and
  # formats, writes records, and gives access to stores and job parameters.
  class Task
    JSON_PAYLOAD_KEY = "json"
    KEY_VALUE_SEPERATOR = "\t" unless defined?(KEY_VALUE_SEPERATOR)
    NUMERIC_PADDING = 16

    attr_reader :input_format, :output_format

    def initialize(input=STDIN, output=STDOUT, input_format = nil, output_format = nil)
      @input, @output = input, output
      @input_format, @output_format = input_format, output_format
    end

    # Writes one record to the output. A nil key is written as 'nil'. With a
    # nil value only the key is written; otherwise the serialised key and
    # value are joined with KEY_VALUE_SEPERATOR.
    def emit(key, value=nil)
      key = 'nil' if key.nil?
      record = if value.nil?
        key.to_s
      else
        "#{output_serialize_key(key)}#{KEY_VALUE_SEPERATOR}#{output_serialize_value(value)}"
      end
      @output.puts(record)
    end

    # Reads +key+ from the store registered under +store+ in Mandy.stores.
    def get(store, key)
      Mandy.stores[store].get(key)
    end

    # Writes +values+ under +key+ to the store registered under +store+.
    def put(store, key, values)
      Mandy.stores[store].put(key, values)
    end

    private

    # Left-pads the part of +key+ before the first "." with zeros to
    # NUMERIC_PADDING characters.
    def pad(key)
      key_parts = key.to_s.split(".")
      key_parts[0] = key_parts.first.rjust(NUMERIC_PADDING, '0')
      key_parts.join('.')
    end

    def update_status(message)
      STDERR.puts("reporter:status:#{message}")
    end

    def update_counter(group, counter, count)
      STDERR.puts("reporter:counter:#{group},#{counter},#{count}")
    end

    # Looks a parameter up in the JSON payload when one is present in the
    # environment, otherwise in the environment itself.
    def parameter(name)
      return find_json_param(name) if json_provided?
      ENV[name.to_s]
    end

    def find_json_param(name)
      @json_args ||= JSON.parse(uri_unescape(ENV[JSON_PAYLOAD_KEY]))
      @json_args[name.to_s]
    end

    def json_provided?
      !ENV[JSON_PAYLOAD_KEY].nil?
    end

    # Percent-decodes +str+. URI.decode was removed in Ruby 3.0; the default
    # parser's unescape is the same routine and exists on old and new Rubies.
    def uri_unescape(str)
      return URI.decode(str) if URI.respond_to?(:decode)
      URI::DEFAULT_PARSER.unescape(str)
    end

    def deserialize_key(key)
      key
    end

    def deserialize_value(value)
      value
    end

    # Numeric keys shorter than NUMERIC_PADDING characters are zero-padded;
    # every other key is returned unchanged.
    def serialize_key(key)
      key = pad(key) if key.is_a?(Numeric) && key.to_s.length < NUMERIC_PADDING
      key
    end

    # Arrays are wrapped in an ArraySerializer first; the result of to_s is
    # returned.
    def serialize_value(value)
      value = ArraySerializer.new(value) if value.is_a?(Array)
      value.to_s
    end

    def output_serialize_key(key)
      serialize_key(key)
    end

    def output_serialize_value(value)
      serialize_value(value)
    end

  end
end
@@ -0,0 +1,75 @@
1
require "stringio" # StringIO is used throughout this file

module Mandy
  # Runs the map and reduce phases of a job against in-memory streams.
  class TestRunner
    attr_reader :job

    # job  - name of the job to run; defaults to the first registered job.
    # opts - :parameters is published to tasks as JSON through the
    #        environment. Without parameters an empty object is published,
    #        so a parameter lookup returns nil instead of failing on "null".
    #
    # Raises ArgumentError when no job with that name is registered.
    def initialize(job=Mandy::Job.jobs.first.name, opts={})
      ENV[Mandy::Task::JSON_PAYLOAD_KEY] = (opts[:parameters] || {}).to_json
      @job = Mandy::Job.find_by_name(job)
      raise ArgumentError, "Unknown job #{job.inspect}" if @job.nil?
    end

    # Runs the map phase. +input_stream+ may be an Array of lines, a String
    # or an IO-like object. Returns the rewound output stream.
    def map(input_stream, output_stream=StringIO.new(''), &blk)
      input_stream = input_from_array(input_stream) if input_stream.is_a?(Array)
      input_stream = StringIO.new(input_stream) if input_stream.is_a?(String)
      @job.run_map(input_stream, output_stream, &blk)
      output_stream.rewind
      output_stream
    end

    # Runs the reduce phase. +input_stream+ may be a Hash of key => value(s),
    # a String or an IO-like object. Returns the rewound output stream.
    def reduce(input_stream, output_stream=StringIO.new(''), &blk)
      input_stream = input_from_hash(input_stream) if input_stream.is_a?(Hash)
      input_stream = StringIO.new(input_stream) if input_stream.is_a?(String)
      @job.run_reduce(input_stream, output_stream, &blk)
      output_stream.rewind
      output_stream
    end

    # Returns a runner that chains every registered job.
    def self.end_to_end(verbose=false)
      CompositeJobRunner.new(Mandy::Job.jobs,verbose)
    end

    private

    def input_from_array(input)
      input.join("\n")
    end

    # Turns {key => value or [values]} into sorted "key\tvalue" lines.
    def input_from_hash(input)
      lines = []
      input.each do |key, values|
        if values.is_a?(Array)
          values.each { |value| lines << "#{key}\t#{value}" }
        else
          lines << "#{key}\t#{values}"
        end
      end
      input_from_array(lines.sort)
    end

    # Runs a list of jobs back to back, feeding each job's reduce output into
    # the next job's map phase.
    class CompositeJobRunner
      def initialize(jobs, verbose=false)
        @jobs = jobs
        @verbose = verbose
        @job_runners = @jobs.map { |job| Mandy::TestRunner.new(job.name) }
      end

      # Pushes +input_stream+ through every job. The last job's reduce output
      # is written to +output_stream+, which is returned.
      def execute(input_stream, output_stream=StringIO.new(''))
        last_index = @job_runners.size - 1
        @job_runners.each_with_index do |runner, index|
          map_output = runner.map(input_stream, StringIO.new(''))
          trace(runner, 'MAP', map_output)
          # sort the map output before it is reduced
          reduce_input = StringIO.new(map_output.readlines.sort.join(''))
          target = index == last_index ? output_stream : StringIO.new('')
          runner.reduce(reduce_input, target)
          # Trace the stream that was actually written; the last job writes
          # to output_stream, not to a temporary buffer.
          trace(runner, 'RED', target)
          input_stream = target
        end
        output_stream
      end

      private

      # In verbose mode prints the stream's lines, then rewinds it so the
      # next reader starts from the beginning.
      def trace(runner, phase, stream)
        return unless @verbose
        puts "#{runner.job.name} [#{phase}] #{stream.readlines.inspect}"
        stream.rewind
      end
    end
  end
end
data/readme.md ADDED
@@ -0,0 +1,11 @@
1
+ Mandy - Simplified Hadoop distribution for Ruby code
2
+ ====================================================
3
+
4
+ Mandy hides the differences and complexities of running map/reduce tasks locally, on a distributed cluster, or in a test environment.
5
+
6
+ It provides a simple DSL to define new jobs for distribution. See examples/word_count.rb for a demo of some functionality.
7
+ Run the word count example locally with...
8
+
9
+ mandy-local examples/word_count.rb examples/alice.txt examples/output
10
+
11
+ Mandy is licensed under the MIT Licence; please see LICENCE for further information.
metadata ADDED
@@ -0,0 +1,97 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: mandy
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.14
5
+ platform: ruby
6
+ authors:
7
+ - Andy Kent
8
+ - Paul Ingles
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2009-07-09 00:00:00 +01:00
14
+ default_executable:
15
+ dependencies: []
16
+
17
+ description: Map/Reduce
18
+ email: andy.kent@me.com
19
+ executables:
20
+ - mandy
21
+ - mandy-hadoop
22
+ - mandy-local
23
+ - mandy-map
24
+ - mandy-put
25
+ - mandy-get
26
+ - mandy-reduce
27
+ - mandy-rm
28
+ - mandy-install
29
+ extensions: []
30
+
31
+ extra_rdoc_files: []
32
+
33
+ files:
34
+ - bin/mandy-hadoop
35
+ - bin/mandy-local
36
+ - bin/mandy-map
37
+ - bin/mandy-get
38
+ - bin/mandy-put
39
+ - bin/mandy-reduce
40
+ - readme.md
41
+ - Rakefile
42
+ - lib/mandy.rb
43
+ - lib/support/tuple.rb
44
+ - lib/support/formatting.rb
45
+ - lib/support/array_serializer.rb
46
+ - lib/task.rb
47
+ - lib/dsl.rb
48
+ - lib/job.rb
49
+ - lib/mappers/base_mapper.rb
50
+ - lib/mappers/transpose_mapper.rb
51
+ - lib/mappers/pass_through_mapper.rb
52
+ - lib/packer.rb
53
+ - lib/reducers/base_reducer.rb
54
+ - lib/reducers/transpose_reducer.rb
55
+ - lib/reducers/pass_through_reducer.rb
56
+ - lib/reducers/sum_reducer.rb
57
+ - lib/reducers/max_reducer.rb
58
+ - lib/reducers/min_reducer.rb
59
+ - lib/serializers/json.rb
60
+ - lib/stores/hbase.rb
61
+ - lib/stores/in_memory.rb
62
+ - lib/ruby-hbase.rb
63
+ - lib/ruby-hbase/hbase_table.rb
64
+ - lib/ruby-hbase/scanner.rb
65
+ - lib/ruby-hbase/version.rb
66
+ - lib/ruby-hbase/xml_decoder.rb
67
+ - lib/test_runner.rb
68
+ has_rdoc: true
69
+ homepage: http://github.com/trafficbroker/mandy
70
+ licenses: []
71
+
72
+ post_install_message:
73
+ rdoc_options: []
74
+
75
+ require_paths:
76
+ - lib
77
+ required_ruby_version: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - ">="
80
+ - !ruby/object:Gem::Version
81
+ version: "0"
82
+ version:
83
+ required_rubygems_version: !ruby/object:Gem::Requirement
84
+ requirements:
85
+ - - ">="
86
+ - !ruby/object:Gem::Version
87
+ version: "0"
88
+ version:
89
+ requirements: []
90
+
91
+ rubyforge_project:
92
+ rubygems_version: 1.3.4
93
+ signing_key:
94
+ specification_version: 2
95
+ summary: Map/Reduce
96
+ test_files: []
97
+