mandy 0.2.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile ADDED
@@ -0,0 +1,17 @@
1
+ require 'rubygems'
2
+ require "rake"
3
+ require File.expand_path(File.join(File.dirname(__FILE__), 'lib', 'mandy'))
4
+ require 'spec/rake/spectask'
5
+
6
# Running plain `rake` executes the spec suite.
task :default => :spec

# Runs every *_spec.rb file found below spec/lib.
Spec::Rake::SpecTask.new(:spec) do |t|
  t.spec_files = FileList['spec/lib/**/*_spec.rb']
  t.spec_opts = %w{-f s -c -L mtime}
end

# Builds the gem from mandy.gemspec, moves it into pkg/ and installs it.
#
# The steps use Rake's `sh` instead of backticks so that a failing step
# aborts the task (backticks silently ignored failures, which could end
# with a stale gem from pkg/ being installed) and so that the output of
# each command stays visible.
task :gem do
  sh 'sudo gem build mandy.gemspec'
  # mkdir_p does not complain when pkg/ already exists.
  mkdir_p 'pkg'
  sh 'mv mandy-*.gem pkg/'
  sh 'sudo gem install pkg/mandy-*.gem'
end
data/bin/mandy ADDED
@@ -0,0 +1,21 @@
1
#!/usr/bin/env ruby
# Prints a short overview of the command line tools that ship with Mandy.

require "rubygems"
require "mandy"

puts "\nYou are running Mandy!"
puts "Here are the commands at your disposal..."
puts ''

# [command, description] pairs. An array (rather than a hash) keeps the
# listing in this order on every Ruby version. mandy-get is installed by
# the gem as well, so it is listed alongside the other HDFS helpers.
commands = [
  ['mandy-install', 'Installs the Mandy Rubygem on several hosts via ssh.'],
  ['mandy-local',   'Run a Map/Reduce task locally without requiring hadoop'],
  ['mandy-hadoop',  'Run a Map/Reduce task on hadoop using the provided cluster config'],
  ['mandy-rm',      'remove a file or directory from HDFS'],
  ['mandy-put',     'upload a file into HDFS'],
  ['mandy-get',     'download (and merge) a file or directory from HDFS'],
  ['mandy-map',     'Run a map task reading on STDIN and writing to STDOUT'],
  ['mandy-reduce',  'Run a reduce task reading on STDIN and writing to STDOUT']
]

commands.each do |command, description|
  puts "#{command.ljust(15)} #{description}"
end
data/bin/mandy-get ADDED
@@ -0,0 +1,29 @@
1
#!/usr/bin/env ruby
# Copies a file or directory from HDFS to the local filesystem using
# `hadoop fs -getmerge`.
require 'optparse'
require 'ostruct'
require 'shellwords'

options = OpenStruct.new

parser = OptionParser.new do |opts|
  opts.banner = "USAGE: mandy-get hdfs_file_location local_file_destination [options]"

  opts.on("-c", "--conf HADOOP_CONF", "Use this cluster xml config file.") do |config|
    options.config = config
  end

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end
parser.parse!

# Both positional arguments are required; without them the hadoop command
# would be started with missing operands.
if ARGV.size < 2
  puts parser
  exit 1
end

# Returns +path+ unchanged when it is absolute, otherwise resolves it
# against the current working directory.
def absolute_path(path)
  path =~ /^\// ? path : File.join(Dir.pwd, path)
end

remote_file = ARGV[0]
local_file  = ARGV[1]
config      = absolute_path(options.config || 'cluster.xml')

# Escape every argument so paths containing spaces or shell metacharacters
# reach hadoop as a single word. $HADOOP_HOME is still expanded by the shell.
arguments = ['-conf', config, '-getmerge', remote_file, local_file].map { |arg| Shellwords.escape(arg) }.join(' ')

`$HADOOP_HOME/bin/hadoop fs #{arguments}`
data/bin/mandy-hadoop ADDED
@@ -0,0 +1,74 @@
1
#!/usr/bin/env ruby
# Runs every job defined in a Mandy script on a Hadoop cluster through
# Hadoop streaming, chaining the output of each job into the next one.
require "rubygems"
require "mandy"
require 'optparse'
require 'ostruct'
require 'shellwords'
require 'uri'

# Percent-encodes +str+. URI.encode was removed in Ruby 3.0, so the parser
# object is used when it exists and URI.encode only on very old Rubies.
def uri_escape(str)
  defined?(URI::DEFAULT_PARSER) ? URI::DEFAULT_PARSER.escape(str) : URI.encode(str)
end

options = OpenStruct.new
# Every -v / -j occurrence adds one entry; each becomes its own -cmdenv flag.
options.cmdenv = []

parser = OptionParser.new do |opts|
  opts.banner = "USAGE: mandy-hadoop script input output [options]"

  opts.on("-p", "--payload PAYLOAD", "Add a working directory to be sent to the cluster.") do |payload|
    options.payload = payload
  end

  opts.on("-c", "--conf HADOOP_CONF", "Use this cluster xml config file.") do |config|
    options.config = config
  end

  opts.on("-v", '--variables name=value', "Pass additional parameters to jobs") do |config|
    options.cmdenv << config
  end

  opts.on("-j", '--json {"key":"1 value"}', "Pass JSON encoded parameters to jobs") do |config|
    options.cmdenv << "json=#{uri_escape(config)}"
  end

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end
parser.parse!

# script, input and output are all mandatory.
if ARGV.size < 3
  puts parser
  exit 1
end

# Returns +path+ unchanged when it is absolute, otherwise resolves it
# against the current working directory.
def absolute_path(path)
  path =~ /^\// ? path : File.join(Dir.pwd, path)
end

file          = ARGV[0]
filename      = File.basename(file)
input         = ARGV[1]
output_folder = ARGV[2]
config        = options.config || 'cluster.xml'
payload       = options.payload ? Mandy::Packer.pack(options.payload) : file

# Packer.pack returns its argument untouched when it is already a file.
# Only an archive that was created here may be deleted on exit; a tar file
# supplied by the user must survive the run.
packed_here = !options.payload.nil? && payload != options.payload

# No -cmdenv flag at all when nothing was requested: a bare "-cmdenv"
# would swallow the following "-output" flag.
cmdenv = options.cmdenv.map { |pair| "-cmdenv #{Shellwords.escape(pair)}" }.join(' ')

at_exit { Mandy::Packer.cleanup!(payload) if packed_here }

require absolute_path(file)

output = nil

Mandy::Job.jobs.each_with_index do |job, i|
  jobconf = job.settings.map { |key, value| %(-D #{key}='#{value}') }.join(' ')
  output  = File.join(output_folder, "#{i+1}-#{job.name.downcase.gsub(/\W/, '-')}")

  command = [
    '$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/contrib/streaming/hadoop-*-streaming.jar',
    jobconf,
    %(-conf '#{config}'),
    %(-input "#{input}"),
    %(-mapper "mandy-map #{filename} '#{job.name}' #{File.basename(payload)}"),
    %(-reducer "mandy-reduce #{filename} '#{job.name}' #{File.basename(payload)}"),
    %(-file "#{payload}"),
    cmdenv,
    %(-output "#{output}")
  ].reject { |part| part.empty? }.join(' ')

  `#{command}`

  # the output of this job is the input of the next one
  input = output
end

# print out the output location so caller can know where to get the results from
puts output
data/bin/mandy-install ADDED
@@ -0,0 +1,11 @@
1
#!/usr/bin/env ruby
# Installs the trafficbroker-mandy gem on every host named on the command
# line by running `gem install` there over ssh.

if ARGV.empty?
  puts "USAGE: mandy-install user@server1.com [user@server2.com ...]"
  exit
end

ARGV.each do |server|
  puts "Installing on #{server}..."
  remote_install = %(ssh #{server} "sudo gem install trafficbroker-mandy --source http://gems.github.com")
  system remote_install
end
data/bin/mandy-local ADDED
@@ -0,0 +1,27 @@
1
#!/usr/bin/env ruby
# Runs every job of a Mandy script locally, without Hadoop, by piping the
# input through `mandy-map | sort | mandy-reduce` once per job.

# All three arguments are needed; the exit status stays 0 for a plain
# call without arguments (a request for the usage line).
if ARGV.size < 3
  puts "USAGE: mandy-local my_script.rb local_input_file local_output_folder"
  exit(ARGV.empty? ? 0 : 1)
end

require "fileutils"
require "shellwords"

# Returns +path+ unchanged when it is absolute, otherwise resolves it
# against the current working directory.
def absolute_path(path)
  path =~ /^\// ? path : File.join(Dir.pwd, path)
end

file          = absolute_path(ARGV[0])
input         = absolute_path(ARGV[1])
# Keep the path itself: FileUtils.mkdir_p returns a list on current Rubies.
output_folder = absolute_path(ARGV[2])
FileUtils.mkdir_p(output_folder)
require file

out = nil
Mandy::Job.jobs.each_with_index do |job, i|
  out = File.join(output_folder, "#{i+1}-#{job.name.downcase.gsub(/\W/, '-')}")
  puts "Running #{job.name}..."

  # Escaped so that paths and job names with spaces or quotes survive the shell.
  script   = Shellwords.escape(file)
  job_name = Shellwords.escape(job.name)
  `cat #{Shellwords.escape(input)} | mandy-map #{script} #{job_name} | sort | mandy-reduce #{script} #{job_name} > #{Shellwords.escape(out)}`

  # the output of this job is the input of the next one
  input = out
end

puts out
data/bin/mandy-map ADDED
@@ -0,0 +1,24 @@
1
#!/usr/bin/env ruby
# Runs the map phase of one named job, reading STDIN and writing STDOUT.
require "rubygems"
require "mandy"

# Script and job name are both required; the exit status stays 0 for a
# plain call without arguments (a request for the usage line).
if ARGV.size < 2
  puts "USAGE: mandy-map my_script.rb 'Job Name' [payload]"
  exit(ARGV.empty? ? 0 : 1)
end

# An optional third argument names a payload archive to unpack first.
Mandy::Packer.unpack(ARGV[2]) if ARGV.size > 2

# Returns +path+ unchanged when it is absolute, otherwise resolves it
# against the current working directory.
def absolute_path(path)
  path =~ /^\// ? path : File.join(Dir.pwd, path)
end

file     = absolute_path(ARGV[0])
job_name = ARGV[1]

require file

job = Mandy::Job.find_by_name(job_name)
# find_by_name returns nil for an unknown name; fail with a readable
# message instead of a NoMethodError on nil.
abort "mandy-map: no job named #{job_name.inspect} is defined in #{file}" if job.nil?

job.run_map
data/bin/mandy-put ADDED
@@ -0,0 +1,28 @@
1
#!/usr/bin/env ruby
# Uploads a local file or folder into HDFS using `hadoop fs -copyFromLocal`.
require 'optparse'
require 'ostruct'
require 'shellwords'

options = OpenStruct.new

parser = OptionParser.new do |opts|
  opts.banner = "USAGE: mandy-put local_file_or_folder hdfs_destination_location [options]"

  opts.on("-c", "--conf HADOOP_CONF", "Use this cluster xml config file.") do |config|
    options.config = config
  end

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end
parser.parse!

# Source and destination are both required; without this check a missing
# source raised a TypeError inside absolute_path.
if ARGV.size < 2
  puts parser
  exit 1
end

# Returns +path+ unchanged when it is absolute, otherwise resolves it
# against the current working directory.
def absolute_path(path)
  path =~ /^\// ? path : File.join(Dir.pwd, path)
end

source = absolute_path(ARGV[0])
dest   = ARGV[1]
config = options.config || 'cluster.xml'

# Escape every argument so paths containing spaces or shell metacharacters
# reach hadoop as a single word. $HADOOP_HOME is still expanded by the shell.
arguments = ['-conf', config, '-copyFromLocal', source, dest].map { |arg| Shellwords.escape(arg) }.join(' ')

`$HADOOP_HOME/bin/hadoop fs #{arguments}`
data/bin/mandy-reduce ADDED
@@ -0,0 +1,24 @@
1
#!/usr/bin/env ruby
# Runs the reduce phase of one named job, reading STDIN and writing STDOUT.
require "rubygems"
require "mandy"

# Script and job name are both required; the exit status stays 0 for a
# plain call without arguments (a request for the usage line).
if ARGV.size < 2
  puts "USAGE: mandy-reduce my_script.rb 'Job Name' [payload]"
  exit(ARGV.empty? ? 0 : 1)
end

# Returns +path+ unchanged when it is absolute, otherwise resolves it
# against the current working directory.
def absolute_path(path)
  path =~ /^\// ? path : File.join(Dir.pwd, path)
end

# An optional third argument names a payload archive to unpack first.
Mandy::Packer.unpack(ARGV[2]) if ARGV.size > 2

file     = absolute_path(ARGV[0])
job_name = ARGV[1]

require file

job = Mandy::Job.find_by_name(job_name)
# find_by_name returns nil for an unknown name; fail with a readable
# message instead of a NoMethodError on nil.
abort "mandy-reduce: no job named #{job_name.inspect} is defined in #{file}" if job.nil?

job.run_reduce
data/bin/mandy-rm ADDED
@@ -0,0 +1,23 @@
1
#!/usr/bin/env ruby
# Removes a file or folder from HDFS using `hadoop fs -rmr`.
require 'optparse'
require 'ostruct'
require 'shellwords'

options = OpenStruct.new

parser = OptionParser.new do |opts|
  opts.banner = "USAGE: mandy-rm file_or_folder_on_hdfs [options]"

  opts.on("-c", "--conf HADOOP_CONF", "Use this cluster xml config file.") do |config|
    options.config = config
  end

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end
parser.parse!

# A target is required; never start a recursive delete without one.
if ARGV.empty?
  puts parser
  exit 1
end

file   = ARGV[0]
config = options.config || 'cluster.xml'

# Escape every argument so paths containing spaces or shell metacharacters
# reach hadoop as a single word. $HADOOP_HOME is still expanded by the shell.
arguments = ['-conf', config, '-rmr', file].map { |arg| Shellwords.escape(arg) }.join(' ')

`$HADOOP_HOME/bin/hadoop fs #{arguments}`
data/lib/dsl.rb ADDED
@@ -0,0 +1,10 @@
1
module Mandy
  # Mixin that provides the `job` keyword used to declare jobs.
  module DSL
    # Creates a Mandy::Job called +name+, evaluates the optional block in
    # the context of that job, appends the job to Mandy::Job.jobs and
    # returns it.
    def job(name, &blk)
      new_job = Mandy::Job.new(name)
      new_job.instance_eval(&blk) if blk
      Mandy::Job.jobs.push(new_job)
      new_job
    end
  end
end
data/lib/job.rb ADDED
@@ -0,0 +1,92 @@
1
module Mandy
  # A named map/reduce job: its job settings, the mapper and reducer
  # classes to run, and the modules to mix into them.
  class Job
    # All jobs registered so far (lazily created list).
    def self.jobs
      @jobs ||= []
    end

    # Returns the first registered job whose name equals +name+, or nil.
    def self.find_by_name(name)
      jobs.detect { |candidate| candidate.name == name }
    end

    attr_reader :settings, :name

    # Starts with pass-through mapper and reducer classes, records the
    # name as the 'mapred.job.name' setting and evaluates the optional
    # block in the context of the new job.
    def initialize(name, &blk)
      @name          = name
      @settings      = {}
      @modules       = []
      @mapper_class  = Mandy::Mappers::PassThroughMapper
      @reducer_class = Mandy::Reducers::PassThroughReducer
      set('mapred.job.name', name)
      instance_eval(&blk) if blk
    end

    # Remembers modules that #map and #reduce include into the task
    # classes they are given. Also available as #serialize.
    def mixin(*modules)
      @modules.concat(modules)
      modules
    end
    alias_method :serialize, :mixin

    # Format handed to the tasks as their input format.
    def input_format(format)
      @input_format = format
    end

    # Format handed to the tasks as their output format.
    def output_format(format)
      @output_format = format
    end

    # Stores a setting; key and value are both converted to strings.
    def set(key, value)
      @settings[key.to_s] = value.to_s
    end

    # Shortcut for the 'mapred.map.tasks' setting.
    def map_tasks(count)
      set('mapred.map.tasks', count)
    end

    # Shortcut for the 'mapred.reduce.tasks' setting.
    def reduce_tasks(count)
      set('mapred.reduce.tasks', count)
    end

    # Registers a store under +name+ in Mandy.stores. Only the :hbase
    # type is known; any other type raises a RuntimeError.
    def store(type, name, options={})
      raise "Unknown store type #{type}" unless type == :hbase
      Mandy.stores[name] = Stores::HBase.new(options)
    end

    # Selects the mapper: either +klass+ or a class compiled from the
    # block. The modules collected by #mixin are included into it.
    def map(klass=nil, &blk)
      @mapper_class = klass || Mandy::Mappers::Base.compile(&blk)
      include_mixins_into(@mapper_class)
    end

    # Selects the reducer: either +klass+ or a class compiled from the
    # block. The modules collected by #mixin are included into it.
    def reduce(klass=nil, &blk)
      @reducer_class = klass || Mandy::Reducers::Base.compile(&blk)
      include_mixins_into(@reducer_class)
    end

    # Builds a mapper for the given streams, yields it to the optional
    # block and executes it. When the job still uses the pass-through
    # reducer, the mapper class gets Mandy::IO::OutputFormatting first.
    def run_map(input=STDIN, output=STDOUT, &blk)
      @mapper_class.send(:include, Mandy::IO::OutputFormatting) unless reducer_defined?
      task = @mapper_class.new(input, output, @input_format, @output_format)
      blk.call(task) if blk
      task.execute
    end

    # Builds a reducer for the given streams, yields it to the optional
    # block and executes it.
    def run_reduce(input=STDIN, output=STDOUT, &blk)
      task = @reducer_class.new(input, output, @input_format, @output_format)
      blk.call(task) if blk
      task.execute
    end

    private

    # True once a reducer other than the pass-through default was chosen.
    def reducer_defined?
      @reducer_class != Mandy::Reducers::PassThroughReducer
    end

    # Includes every module collected by #mixin into +task_class+ and
    # returns +task_class+.
    def include_mixins_into(task_class)
      @modules.each { |mod| task_class.send(:include, mod) }
      task_class
    end

  end
end
data/lib/mandy.rb ADDED
@@ -0,0 +1,35 @@
1
+ require "rubygems"
2
+ require "json"
3
+ require "uri"
4
+
5
+ %w(
6
+ support/formatting
7
+ task
8
+ dsl
9
+ job
10
+ packer
11
+ support/tuple
12
+ support/array_serializer
13
+ mappers/base_mapper
14
+ mappers/transpose_mapper
15
+ mappers/pass_through_mapper
16
+ reducers/base_reducer
17
+ reducers/pass_through_reducer
18
+ reducers/sum_reducer
19
+ reducers/max_reducer
20
+ reducers/min_reducer
21
+ reducers/transpose_reducer
22
+ serializers/json
23
+ stores/hbase
24
+ stores/in_memory
25
+ test_runner
26
+ ruby-hbase
27
+ ).each {|file| require File.join(File.dirname(__FILE__), file) }
28
+
29
module Mandy
  # Registry of stores keyed by name; created empty on first access and
  # shared by every later call.
  def self.stores
    @stores ||= {}
  end
end
@@ -0,0 +1,30 @@
1
module Mandy
  module Mappers
    # Base class for mappers: reads the input line by line and hands each
    # key/value pair to #mapper.
    class Base < Mandy::Task
      include Mandy::IO::InputFormatting

      # Returns an anonymous subclass whose #mapper is the given block
      # (when a block is given).
      def self.compile(&blk)
        Class.new(Mandy::Mappers::Base) do
          define_method(:mapper, blk) if blk
        end
      end

      # Splits every input line at the first separator into key and
      # value. A line without a separator is treated as a value with a
      # nil key. The trailing newline is removed from the value before
      # both parts are deserialized and passed to #mapper.
      def execute
        @input.each_line do |line|
          fields = line.split(KEY_VALUE_SEPERATOR, 2)
          fields.unshift(nil) if fields.size < 2
          raw_key, raw_value = fields
          raw_value.chomp!
          mapper(input_deserialize_key(raw_key), input_deserialize_value(raw_value))
        end
      end

      private

      # Default mapper: does nothing. Subclasses override it.
      def mapper(key,value)
      end
    end
  end
end
@@ -0,0 +1,10 @@
1
module Mandy
  module Mappers
    # Mapper that forwards its input unchanged.
    class PassThroughMapper < Base
      # Emits each key/value pair exactly as it was received.
      def mapper(key, value)
        emit(key, value)
      end
    end
  end
end
@@ -0,0 +1,10 @@
1
module Mandy
  module Mappers
    # Mapper that swaps key and value.
    class TransposeMapper < Base
      # Emits the incoming value as the key and the incoming key as the value.
      def mapper(key, value)
        emit(value, key)
      end
    end
  end
end
data/lib/packer.rb ADDED
@@ -0,0 +1,25 @@
1
require "fileutils"
require "shellwords"

module Mandy
  # Packs a working directory into a tar archive, unpacks such archives
  # and removes them again.
  class Packer
    TMP_DIR = '/tmp/mandy'

    # Returns +dir+ untouched when it is already a file. Otherwise tars
    # the (non-hidden) contents of the directory into a new archive
    # below TMP_DIR and returns the archive path.
    def self.pack(dir)
      return dir if File.file?(dir)
      FileUtils.mkdir_p(TMP_DIR)
      # The process id keeps two runs started within the same second apart.
      tmp_path = "#{TMP_DIR}/packed-job-#{Time.now.to_i}-#{Process.pid}.tar"
      Dir.chdir(dir) { `tar -cf #{Shellwords.escape(tmp_path)} *` }
      tmp_path
    end

    # Extracts +file+ into the current directory. Returns false without
    # doing anything when +file+ does not end in .tar.
    def self.unpack(file)
      return false unless File.extname(file) == '.tar'
      `tar -xf #{Shellwords.escape(file)}`
    end

    # Deletes +file+ when it ends in .tar and returns true; returns false
    # and leaves the file alone otherwise. Uses FileUtils rather than a
    # shell `rm`, so paths with spaces work and a missing file is not an
    # error.
    def self.cleanup!(file)
      return false unless File.extname(file) == '.tar'
      FileUtils.rm_f(file)
      true
    end
  end
end
@@ -0,0 +1,36 @@
1
module Mandy
  module Reducers
    # Base class for reducers: groups consecutive input lines that share
    # a key and hands each group to #reducer.
    class Base < Mandy::Task
      include Mandy::IO::OutputFormatting

      # Returns an anonymous subclass whose #reducer is the given block
      # (when a block is given).
      def self.compile(&blk)
        Class.new(Mandy::Reducers::Base) do
          self.class_eval do
            define_method(:reducer, blk) if blk
          end
        end
      end

      # Reads "key<separator>value" lines, collects the values of equal
      # consecutive keys and calls #reducer once per group. The input is
      # expected to arrive sorted by key.
      #
      # * Every group is deserialized before it reaches #reducer, not
      #   only the last one.
      # * Lines are split at the first separator only, so values may
      #   contain the separator themselves.
      # * A line without a separator counts as a key with an empty value.
      # * Empty input produces no #reducer call at all.
      def execute
        current_key, values, seen_input = nil, [], false
        @input.each_line do |line|
          key, value = line.chomp.split(KEY_VALUE_SEPERATOR, 2)
          key   ||= ''
          value ||= ''
          if seen_input && key != current_key
            reduce_group(current_key, values)
            values = []
          end
          current_key, seen_input = key, true
          values << value
        end
        reduce_group(current_key, values) if seen_input
      end

      private

      # Deserializes one key and its values and passes them to #reducer.
      def reduce_group(key, values)
        reducer(deserialize_key(key), values.map { |v| deserialize_value(v) })
      end

      # Default reducer: does nothing. Subclasses override it.
      def reducer(key,values)
      end
    end
  end
end
@@ -0,0 +1,10 @@
1
module Mandy
  module Reducers
    # Reducer that emits the largest value of each key.
    class MaxReducer < Base
      # Converts the values to floats in place and emits the maximum.
      def reducer(key, values)
        values.map!(&:to_f)
        emit(key, values.max)
      end
    end
  end
end
@@ -0,0 +1,10 @@
1
module Mandy
  module Reducers
    # Reducer that emits the smallest value of each key.
    class MinReducer < Base
      # Converts the values to floats in place and emits the minimum.
      def reducer(key, values)
        values.map!(&:to_f)
        emit(key, values.min)
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
module Mandy
  module Reducers
    # Reducer that forwards its input unchanged.
    class PassThroughReducer < Base
      # Emits one key/value pair per value of the group.
      def reducer(key, values)
        values.each do |value|
          emit(key, value)
        end
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
module Mandy
  module Reducers
    # Reducer that emits the sum of the values of each key.
    class SumReducer < Base
      # Adds up the values (each converted with to_f, starting from 0)
      # and emits the total.
      def reducer(key, values)
        total = values.inject(0) { |running, item| running + item.to_f }
        emit(key, total)
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
module Mandy
  module Reducers
    # Reducer that swaps key and value.
    class TransposeReducer < Base
      # Emits every value of the group as a key, with the group key as its value.
      def reducer(key, values)
        values.each do |value|
          emit(value, key)
        end
      end
    end
  end
end
data/lib/ruby-hbase.rb ADDED
@@ -0,0 +1,10 @@
1
+ $:.unshift File.dirname(__FILE__)
2
+
3
+ require "rubygems"
4
+ require "net/http"
5
+ require "erb"
6
+ require "xml/libxml"
7
+
8
+ require "ruby-hbase/xml_decoder"
9
+ require "ruby-hbase/hbase_table"
10
+ require "ruby-hbase/scanner"
@@ -0,0 +1,166 @@
1
module HBase
  # Raised by HTable#get when the server answers 404 for the requested row.
  # Derives from StandardError (not Exception) so that an ordinary
  # `rescue` clause is able to handle it.
  class RowNotFound < StandardError
  end

  # Client for a single table reached over HTTP under /api/<table name>.
  class HTable
    include XmlDecoder

    DEFAULT_GET_OPTIONS = {:timestamp => nil, :columns => nil}

    # +table_uri+ is the URL of the table; its last path segment is
    # used as the table name.
    def initialize(table_uri)
      @table_uri = table_uri
      @uri = URI.parse(table_uri)
      @host, @table_name = @uri.host, @uri.path.split("/").last
    end

    # Name of the table (last path segment of the table URL).
    def name
      @table_name
    end

    ######################
    # Meta-type requests

    # Not implemented; always raises NotImplementedError.
    def start_keys
      raise NotImplementedError
    end

    # Fetches the table description and returns one hash per column
    # family, each holding the family :name with surrounding whitespace
    # and the final character removed.
    def column_descriptors
      response = Net::HTTP.get_response(@uri.host, "/api/#{@table_name}", @uri.port)
      doc = XML::Parser.string(response.body).parse

      doc.find("/table/columnfamilies/columnfamily").map do |node|
        {:name => node.find_first("name").content.strip.chop}
      end
    end

    #####################
    # Standard CRUD ops

    # Fetches the row stored under +key+ and returns its column values
    # as a hash.
    #
    # Options:
    #   :columns   - one column name or a list of names to restrict the result
    #   :timestamp - time of the version to read (converted to milliseconds)
    #
    # Raises RowNotFound on a 404 response and a RuntimeError on a 204
    # response. Any other non-200 response yields nil.
    def get(key, options = {})
      opts = DEFAULT_GET_OPTIONS.merge(options)
      columns = Array(opts[:columns]).compact
      timestamp = opts[:timestamp]

      ts_section = timestamp ? "/#{(timestamp.to_f * 1000).to_i}" : ""
      columns_query = columns.map { |name| "column=#{name}" }.join("&")
      query = "/api/#{@table_name}/row/#{url_encode(key)}#{ts_section}?#{columns_query}"

      Net::HTTP.start(@uri.host, @uri.port) do |session|
        response = session.get(query, {"Accept" => "*/*"})

        case response.code.to_i
        when 200 #success!
          parse_row_result(response.body).last
        when 204 #no data - probably an incorrect colname
          raise "Didn't get any data back - check your column names!"
        when 404
          raise RowNotFound, "Could not find row '#{key}'"
        else
          nil
        end
      end
    end

    # Writes the given column name => value pairs to the row +key+.
    # Values are sent base64 encoded. Returns true on a 200 response and
    # raises a RuntimeError for any other response.
    def put(key, keys_and_values, timestamp = nil)
      ts_section = timestamp ? "/#{(timestamp.to_f * 1000).to_i}" : ""

      xml = "<columns>"
      keys_and_values.each do |name, value|
        xml << "<column><name>#{name}</name><value>#{[value.to_s].pack("m")}</value></column>"
      end
      xml << "</columns>"

      Net::HTTP.start(@uri.host, @uri.port) do |session|
        query = "/api/#{@table_name}/row/#{url_encode(key)}#{ts_section}"
        response = session.post(query, xml, {"Content-type" => "text/xml"})

        case response.code.to_i
        when 200
          true
        else
          unexpected_response(response)
        end
      end
    end

    # Deletes the row +row+, optionally restricted to +columns+. Returns
    # true on a 202 response and raises a RuntimeError otherwise.
    #
    # The row key is URL-encoded exactly like in #get and #put. The
    # +timestamp+ argument is accepted but not used by this method.
    def delete(row, columns = nil, timestamp = nil)
      columns_query = Array(columns).compact.map { |name| "column=#{name}" }.join("&")

      Net::HTTP.start(@uri.host, @uri.port) do |session|
        response = session.delete("/api/#{@table_name}/row/#{url_encode(row)}?#{columns_query}")

        case response.code.to_i
        when 202
          true
        else
          unexpected_response(response)
        end
      end
    end

    #######################
    # Scanning interface

    # Opens a scanner on the server and returns a Scanner for the
    # location announced in the 201 response. Raises a RuntimeError for
    # any other response.
    def get_scanner(start_row, end_row, timestamp = nil, columns = nil)
      start_row_query = start_row ? "start_row=#{start_row}" : nil
      end_row_query = end_row ? "end_row=#{end_row}" : nil
      # NOTE(review): the timestamp is rendered as a path-like "/<millis>"
      # piece but is joined into the query string below - confirm the
      # format the server expects before relying on timestamps here.
      timestamp_section = timestamp ? "/#{(timestamp.to_f * 1000).to_i}" : nil
      columns_section = columns ? columns.map { |col| "column=#{col}" }.join("&") : nil

      query_string = [start_row_query, end_row_query,
                      timestamp_section, columns_section].compact.join("&")

      # open the scanner
      Net::HTTP.start(@uri.host, @uri.port) do |session|
        response = session.post("/api/#{@table_name}/scanner?#{query_string}",
          "", {"Accept" => "text/xml"}
        )

        case response.code.to_i
        when 201
          # redirect - grab the path and send
          Scanner.new(self, "http://#{@uri.host}:#{@uri.port}" + response["Location"])
        else
          unexpected_response(response)
        end
      end
    end

    private

    # Percent-encodes +str+ for use inside a URL.
    def url_encode(str)
      ERB::Util.url_encode(str)
    end

    # Raises a RuntimeError describing the response code and body.
    def unexpected_response(response)
      raise "Unexpected response code #{response.code.to_i}:\n#{response.body}"
    end
  end
end
@@ -0,0 +1,55 @@
1
module HBase
  # Iterates over the rows of an open scanner identified by its URL.
  class Scanner
    include XmlDecoder

    # +table+ is the table the scanner belongs to, +scanner_uri+ the URL
    # of the scanner resource.
    def initialize(table, scanner_uri)
      @table = table
      @scanner_uri = scanner_uri
    end

    # Placeholder; does nothing.
    def close
    end

    # Placeholder; does nothing.
    def next
    end

    # Posts to the scanner URL repeatedly and yields the parsed row name
    # and column values of every 200 response. A 404 response ends the
    # iteration; any other response raises a RuntimeError.
    def each
      location = URI.parse(@scanner_uri)
      Net::HTTP.start(location.host, location.port) do |session|
        while (response = session.post(@scanner_uri, ""))
          status = response.code.to_i
          break if status == 404
          unless status == 200
            raise "Unexpected response code #{response.code}, body:\n#{response.body}"
          end
          yield(*parse_row_result(response.body))
        end
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
module RubyHbase #:nodoc:
  # Version number of the bundled ruby-hbase code.
  module VERSION #:nodoc:
    MAJOR = 0
    MINOR = 0
    TINY  = 4

    # Dotted "major.minor.tiny" form.
    STRING = "#{MAJOR}.#{MINOR}.#{TINY}"
  end
end
@@ -0,0 +1,18 @@
1
module HBase
  # Mixin that turns the XML of a row response into plain Ruby values.
  module XmlDecoder
    # Parses +xml+ and returns [row_name, columns].
    #
    # row_name is the stripped text of /row/name, or nil when that node
    # is missing. columns maps each column name to its value; both are
    # stripped and base64-decoded.
    def parse_row_result(xml)
      document = XML::Parser.string(xml).parse

      row_name = document.root.find_first("/row/name")
      row_name = row_name.content.strip if row_name

      columns = {}
      document.find("/row/columns/column").each do |column|
        column_name = column.find_first("name").content.strip.unpack('m').first
        columns[column_name] = column.find_first("value").content.strip.unpack("m").first
      end

      [row_name, columns]
    end
  end
end
@@ -0,0 +1,13 @@
1
module Mandy
  module Serializers
    # Mixin that (de)serializes values as JSON.
    module Json
      # Returns the JSON text for +value+.
      def serialize_value(value)
        value.to_json
      end

      # Returns the Ruby object parsed from the JSON text +value+.
      def deserialize_value(value)
        JSON.parse(value)
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
module Mandy
  module Stores
    # Store backed by an HBase table.
    class HBase
      attr_reader :options

      # +options+ must contain the table URL under :url.
      def initialize(options)
        @options = options
        @table = ::HBase::HTable.new(options[:url])
      end

      # Reads the row stored under +key+ from the table.
      def get(key)
        @table.get(key)
      end

      # Writes +values+ to the row +key+ of the table.
      def put(key, values)
        @table.put(key, values)
      end

      # Two stores are equal when they have the same class and options.
      def ==(other)
        return false unless self.class == other.class
        self.options == other.options
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
module Mandy
  module Stores
    # Store that keeps its rows in a plain hash.
    class InMemory
      attr_reader :options

      def initialize(options={})
        @options = options
        @table = {}
      end

      # Returns the values stored under +key+ (compared as a string), or nil.
      def get(key)
        @table[key.to_s]
      end

      # Stores +values+ under the string form of +key+.
      def put(key, values)
        @table[key.to_s] = values
      end

      # Two stores are equal when they have the same class and options;
      # the stored rows are not compared.
      def ==(other)
        return false unless self.class == other.class
        self.options == other.options
      end
    end
  end
end
@@ -0,0 +1,32 @@
1
module Mandy
  # Writes a list as a single string whose items are joined by SEPERATOR
  # and splits such strings again.
  class ArraySerializer

    SEPERATOR = '|' unless defined?(SEPERATOR)

    attr_reader :items

    # +items+ may be nil, which is treated as an empty list.
    def initialize(items)
      @items = items || []
    end

    # The items joined by SEPERATOR.
    def to_s
      items.join(SEPERATOR)
    end

    # Equal to another ArraySerializer with equal items, and to a plain
    # Array equal to the items.
    def ==(other)
      return true if self.class == other.class && self.items == other.items
      other.is_a?(Array) && self.items == other
    end

    # The wrapped list itself.
    def to_a
      items
    end

    # Splits +str+ at SEPERATOR into an array of strings.
    def self.from_s(str)
      str.split(SEPERATOR)
    end

    # Splits +str+ at SEPERATOR and turns every part into a Tuple.
    def self.tuples_from_s(str)
      from_s(str).map { |part| Tuple.from_s(part) }
    end
  end
end
@@ -0,0 +1,27 @@
1
module Mandy
  module IO
    # Input side: keys and values pass through untouched when the
    # including object's input_format is :plain, otherwise they go
    # through deserialize_key / deserialize_value.
    module InputFormatting
      def input_deserialize_key(key)
        input_format == :plain ? key : deserialize_key(key)
      end

      def input_deserialize_value(value)
        input_format == :plain ? value : deserialize_value(value)
      end
    end

    # Output side: keys and values pass through untouched when the
    # including object's output_format is :plain, otherwise they go
    # through serialize_key / serialize_value.
    module OutputFormatting
      def output_serialize_key(key)
        output_format == :plain ? key : serialize_key(key)
      end

      def output_serialize_value(value)
        output_format == :plain ? value : serialize_value(value)
      end
    end
  end
end
@@ -0,0 +1,40 @@
1
module Mandy
  # A name/value pair that is written as "name<SEPERATOR>value".
  class Tuple

    SEPERATOR = ',' unless defined?(SEPERATOR)

    attr_accessor :name, :value

    # +name_accessor+ and +value_accessor+ optionally name extra
    # reader/writer pairs for name and value. They are aliased on the
    # Tuple class itself, so they become available on every tuple.
    def initialize(name, value, name_accessor = nil, value_accessor = nil)
      @name = name
      @value = value
      alias_accessor(name_accessor, :name) unless name_accessor.nil?
      alias_accessor(value_accessor, :value) unless value_accessor.nil?
    end

    # "name<SEPERATOR>value"
    def to_s
      "#{@name}#{SEPERATOR}#{@value}"
    end

    # Builds a tuple from "A<SEPERATOR>B". Raises a RuntimeError unless
    # splitting +str+ at SEPERATOR yields exactly two parts.
    def self.from_s(str)
      fields = str.split(SEPERATOR)
      if fields.size != 2
        raise "Can't create tuple from #{str.inspect}. Format should be 'A#{SEPERATOR}B'"
      end
      new(fields[0], fields[1])
    end

    def inspect
      "<Tuple #{self}>"
    end

    # Tuples are equal when class, name and value all match.
    def ==(other)
      self.class == other.class && self.name == other.name && self.value == other.value
    end

    private

    # Aliases the reader and the writer of +old_accessor+ under the name
    # +new_accessor+.
    def alias_accessor(new_accessor, old_accessor)
      klass = self.class
      klass.send(:alias_method, new_accessor, old_accessor)
      klass.send(:alias_method, :"#{new_accessor}=", :"#{old_accessor}=")
    end
  end
end
data/lib/task.rb ADDED
@@ -0,0 +1,83 @@
1
module Mandy
  # Common base of mappers and reducers: holds the input and output
  # streams and offers helpers to emit pairs, reach stores, report
  # status and read job parameters.
  class Task
    JSON_PAYLOAD_KEY = "json"
    KEY_VALUE_SEPERATOR = "\t" unless defined?(KEY_VALUE_SEPERATOR)
    NUMERIC_PADDING = 16

    attr_reader :input_format, :output_format

    def initialize(input=STDIN, output=STDOUT, input_format = nil, output_format = nil)
      @input, @output = input, output
      @input_format, @output_format = input_format, output_format
    end

    # Writes one line to the output: "key<separator>value" with both
    # parts serialized, or only the key when +value+ is nil. A nil key is
    # written as the text "nil".
    def emit(key, value=nil)
      key = 'nil' if key.nil?
      line = if value.nil?
        key.to_s
      else
        "#{output_serialize_key(key)}#{KEY_VALUE_SEPERATOR}#{output_serialize_value(value)}"
      end
      @output.puts(line)
    end

    # Reads +key+ from the store registered under +store+.
    def get(store, key)
      Mandy.stores[store].get(key)
    end

    # Writes +values+ under +key+ to the store registered under +store+.
    def put(store, key, values)
      Mandy.stores[store].put(key, values)
    end

    private

    # Left-pads the part of +key+ before the first "." with zeros to
    # NUMERIC_PADDING characters.
    def pad(key)
      key_parts = key.to_s.split(".")
      key_parts[0] = key_parts.first.rjust(NUMERIC_PADDING, '0')
      key_parts.join('.')
    end

    # Writes a "reporter:status:" line to STDERR.
    def update_status(message)
      STDERR.puts("reporter:status:#{message}")
    end

    # Writes a "reporter:counter:" line to STDERR.
    def update_counter(group, counter, count)
      STDERR.puts("reporter:counter:#{group},#{counter},#{count}")
    end

    # Returns the job parameter +name+: taken from the JSON payload when
    # the payload environment variable is set, otherwise from the
    # environment variable called +name+.
    def parameter(name)
      return find_json_param(name) if json_provided?
      ENV[name.to_s]
    end

    # Looks +name+ up in the decoded JSON payload. Returns nil when the
    # payload is not a JSON object (for example the literal "null").
    def find_json_param(name)
      @json_args ||= JSON.parse(uri_unescape(ENV[JSON_PAYLOAD_KEY]))
      @json_args.is_a?(Hash) ? @json_args[name.to_s] : nil
    end

    # Percent-decodes +str+. URI.decode was removed in Ruby 3.0, so the
    # parser object is used when it exists and URI.decode only on very
    # old Rubies.
    def uri_unescape(str)
      defined?(URI::DEFAULT_PARSER) ? URI::DEFAULT_PARSER.unescape(str) : URI.decode(str)
    end

    def json_provided?
      !ENV[JSON_PAYLOAD_KEY].nil?
    end

    # Default: keys are used as read.
    def deserialize_key(key)
      key
    end

    # Default: values are used as read.
    def deserialize_value(value)
      value
    end

    # Numeric keys shorter than NUMERIC_PADDING characters are
    # zero-padded; every other key is returned unchanged.
    def serialize_key(key)
      key = pad(key) if key.is_a?(Numeric) && key.to_s.length < NUMERIC_PADDING
      key
    end

    # Arrays are written through ArraySerializer, everything else with to_s.
    def serialize_value(value)
      value = ArraySerializer.new(value) if value.is_a?(Array)
      value.to_s
    end

    def output_serialize_key(key)
      serialize_key(key)
    end

    def output_serialize_value(value)
      serialize_value(value)
    end

  end
end
@@ -0,0 +1,75 @@
1
require "stringio"

module Mandy
  # Runs the map or reduce phase of a job against in-memory streams so
  # jobs can be exercised in tests.
  class TestRunner
    attr_reader :job

    # +job+ is the name of a registered job (default: the first one).
    # opts[:parameters] is exported as the JSON payload that tasks read
    # their parameters from. Without parameters the payload variable is
    # cleared instead of being set to the text "null", which tasks could
    # not look parameters up in.
    def initialize(job=Mandy::Job.jobs.first.name, opts={})
      parameters = opts[:parameters]
      ENV[Mandy::Task::JSON_PAYLOAD_KEY] = parameters.nil? ? nil : parameters.to_json
      @job = Mandy::Job.find_by_name(job)
    end

    # Runs the map phase. +input_stream+ may be an IO-like object, a
    # String, or an Array of lines. Returns the rewound output stream.
    def map(input_stream, output_stream=StringIO.new(''), &blk)
      input_stream = input_from_array(input_stream) if input_stream.is_a?(Array)
      input_stream = StringIO.new(input_stream) if input_stream.is_a?(String)
      @job.run_map(input_stream, output_stream, &blk)
      output_stream.rewind
      output_stream
    end

    # Runs the reduce phase. +input_stream+ may be an IO-like object, a
    # String, or a Hash of key => value(s). Returns the rewound output
    # stream.
    def reduce(input_stream, output_stream=StringIO.new(''), &blk)
      input_stream = input_from_hash(input_stream) if input_stream.is_a?(Hash)
      input_stream = StringIO.new(input_stream) if input_stream.is_a?(String)
      @job.run_reduce(input_stream, output_stream, &blk)
      output_stream.rewind
      output_stream
    end

    # Returns a runner that chains all registered jobs.
    def self.end_to_end(verbose=false)
      CompositeJobRunner.new(Mandy::Job.jobs,verbose)
    end

    private

    # Joins the lines with newlines.
    def input_from_array(input)
      input.join("\n")
    end

    # Turns {key => value} and {key => [values]} entries into sorted
    # "key\tvalue" lines.
    def input_from_hash(input)
      lines = []
      input.each do |key, values|
        if values.is_a?(Array)
          values.each { |value| lines << "#{key}\t#{value}" }
        else
          lines << "#{key}\t#{values}"
        end
      end
      input_from_array(lines.sort)
    end

    # Runs map, sort and reduce for each job in turn, feeding the reduce
    # output of one job into the map phase of the next.
    class CompositeJobRunner
      def initialize(jobs, verbose=false)
        @jobs = jobs
        @verbose = verbose
        @job_runners = @jobs.map { |job| Mandy::TestRunner.new(job.name) }
      end

      # Returns +output_stream+, which receives the reduce output of the
      # last job.
      def execute(input_stream, output_stream=StringIO.new(''))
        @job_runners.each_with_index do |runner, index|
          map_temp = StringIO.new('')
          runner.map(input_stream, map_temp)
          if @verbose
            puts "#{runner.job.name} [MAP] #{map_temp.readlines.inspect}"
            map_temp.rewind
          end

          reduce_input = StringIO.new(map_temp.readlines.sort.join(''))
          # The last job writes straight into the caller's stream.
          last_job = index == @job_runners.size - 1
          reduce_sink = last_job ? output_stream : StringIO.new('')
          runner.reduce(reduce_input, reduce_sink)
          if @verbose
            # Read from the stream that was actually written to, so the
            # trace of the last job is not empty.
            puts "#{runner.job.name} [RED] #{reduce_sink.readlines.inspect}"
            reduce_sink.rewind
          end

          input_stream = reduce_sink
        end
        output_stream
      end
    end
  end
end
data/readme.md ADDED
@@ -0,0 +1,11 @@
1
+ Mandy - Simplified Hadoop distribution for Ruby code
2
+ ====================================================
3
+
4
+ Mandy hides the differences between, and the complexity of, running map/reduce tasks locally, on a distributed Hadoop cluster, or in a test environment.
5
+
6
+ It provides a simple DSL to define new jobs for distribution. See examples/word_count.rb for a demo of some functionality.
7
+ Run the word count example locally with...
8
+
9
+ mandy-local examples/word_count.rb examples/alice.txt examples/output
10
+
11
+ Mandy is licensed under the MIT Licence; please see LICENCE for further information.
metadata ADDED
@@ -0,0 +1,97 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: mandy
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.14
5
+ platform: ruby
6
+ authors:
7
+ - Andy Kent
8
+ - Paul Ingles
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2009-07-09 00:00:00 +01:00
14
+ default_executable:
15
+ dependencies: []
16
+
17
+ description: Map/Reduce
18
+ email: andy.kent@me.com
19
+ executables:
20
+ - mandy
21
+ - mandy-hadoop
22
+ - mandy-local
23
+ - mandy-map
24
+ - mandy-put
25
+ - mandy-get
26
+ - mandy-reduce
27
+ - mandy-rm
28
+ - mandy-install
29
+ extensions: []
30
+
31
+ extra_rdoc_files: []
32
+
33
+ files:
34
+ - bin/mandy-hadoop
35
+ - bin/mandy-local
36
+ - bin/mandy-map
37
+ - bin/mandy-get
38
+ - bin/mandy-put
39
+ - bin/mandy-reduce
40
+ - readme.md
41
+ - Rakefile
42
+ - lib/mandy.rb
43
+ - lib/support/tuple.rb
44
+ - lib/support/formatting.rb
45
+ - lib/support/array_serializer.rb
46
+ - lib/task.rb
47
+ - lib/dsl.rb
48
+ - lib/job.rb
49
+ - lib/mappers/base_mapper.rb
50
+ - lib/mappers/transpose_mapper.rb
51
+ - lib/mappers/pass_through_mapper.rb
52
+ - lib/packer.rb
53
+ - lib/reducers/base_reducer.rb
54
+ - lib/reducers/transpose_reducer.rb
55
+ - lib/reducers/pass_through_reducer.rb
56
+ - lib/reducers/sum_reducer.rb
57
+ - lib/reducers/max_reducer.rb
58
+ - lib/reducers/min_reducer.rb
59
+ - lib/serializers/json.rb
60
+ - lib/stores/hbase.rb
61
+ - lib/stores/in_memory.rb
62
+ - lib/ruby-hbase.rb
63
+ - lib/ruby-hbase/hbase_table.rb
64
+ - lib/ruby-hbase/scanner.rb
65
+ - lib/ruby-hbase/version.rb
66
+ - lib/ruby-hbase/xml_decoder.rb
67
+ - lib/test_runner.rb
68
+ has_rdoc: true
69
+ homepage: http://github.com/trafficbroker/mandy
70
+ licenses: []
71
+
72
+ post_install_message:
73
+ rdoc_options: []
74
+
75
+ require_paths:
76
+ - lib
77
+ required_ruby_version: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - ">="
80
+ - !ruby/object:Gem::Version
81
+ version: "0"
82
+ version:
83
+ required_rubygems_version: !ruby/object:Gem::Requirement
84
+ requirements:
85
+ - - ">="
86
+ - !ruby/object:Gem::Version
87
+ version: "0"
88
+ version:
89
+ requirements: []
90
+
91
+ rubyforge_project:
92
+ rubygems_version: 1.3.4
93
+ signing_key:
94
+ specification_version: 2
95
+ summary: Map/Reduce
96
+ test_files: []
97
+