mapredus 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,366 @@
1
+ module MapRedus
2
+
3
+ # This is what keeps track of our map reduce processes
4
+ #
5
+ # We use a redis key to identify the id of map reduce process
6
+ # the value of the redis object is a json object which contains:
7
+ #
8
+ # {
9
+ # mapper : mapclass,
10
+ # reducer : reduceclass,
11
+ # finalizer : finalizerclass,
12
+ # partitioner : <not supported>,
13
+ # combiner : <not supported>,
14
+ # ordered : true_or_false ## ensures ordering keys from the map output --> [ order, key, value ],
15
+ # synchronous : true_or_false ## runs the process synchronously or not (generally used for testing)
16
+ # result_timeout : length of time a result is saved ## 3600 * 24
17
+ # keyname : the location to save the result of the process (cache location)
18
+ # state : the current state of the process (shouldn't be set by the process and starts off as nil)
19
+ # }
20
+ #
21
+ # The user has the ability in subclassing this class to create extra features if needed
22
+ #
23
+ class Process
24
+ # Public: Keep track of information that may show up as the redis json value
25
+ # This is so we know exactly what might show up in the json hash
26
+ READERS = [:pid]
27
+ ATTRS = [:inputter, :mapper, :reducer, :finalizer, :outputter, :ordered, :synchronous, :result_timeout, :keyname, :state]
28
+ READERS.each { |r| attr_reader r }
29
+ ATTRS.each { |a| attr_accessor a }
30
+
31
+ DEFAULT_TIME = 3600 * 24
32
+ def initialize(pid, json_info)
33
+ @pid = pid
34
+ read(json_info)
35
+ end
36
+
37
+ def read(json_info)
38
+ @inputter = Helper.class_get(json_helper(json_info, :inputter))
39
+ @mapper = Helper.class_get(json_helper(json_info, :mapper))
40
+ @reducer = Helper.class_get(json_helper(json_info, :reducer))
41
+ @finalizer = Helper.class_get(json_helper(json_info, :finalizer))
42
+ @ordered = json_helper(json_info, :ordered)
43
+ @synchronous = json_helper(json_info, :synchronous)
44
+ @result_timeout = json_helper(json_info, :result_timeout) || DEFAULT_TIME
45
+ @keyname = json_helper(json_info, :keyname)
46
+ @state = json_helper(json_info, :state) || NOT_STARTED
47
+ @outputter = json_helper(json_info, :outputter)
48
+ @outputter = @outputter ? Helper.class_get(@outputter) : MapRedus::Outputter
49
+ end
50
+
51
+ def json_helper(json_info, key)
52
+ json_info[key.to_s] || json_info[key.to_sym]
53
+ end
54
+
55
+ def to_s; to_json; end
56
+
57
+ def to_hash
58
+ (ATTRS + READERS).inject({}) do |h, attr|
59
+ h[attr] = send(attr)
60
+ h
61
+ end
62
+ end
63
+
64
+ def to_json
65
+ Helper.encode(to_hash)
66
+ end
67
+
68
+ def save
69
+ FileSystem.sadd( ProcessInfo.processes, @pid )
70
+ FileSystem.save( ProcessInfo.pid(@pid), to_json )
71
+ self
72
+ end
73
+
74
+ def update(attrs = {})
75
+ attrs.each do |attr, val|
76
+ send("#{attr}=", val)
77
+ end
78
+ save
79
+ end
80
+
81
+ def reload
82
+ read(Helper.decode(FileSystem.get(ProcessInfo.pid(@pid))))
83
+ self
84
+ end
85
+
86
+ # This will not delete if the master is working
87
+ # It can't get ahold of the files to shred while the master is working
88
+ #
89
+ # if safe is set to false, this will delete all the redis stores associated
90
+ # with this process, but will not kill the process from the queue, if it is
91
+ # on the queue. The process operations will fail to work when its data is deleted
92
+ #
93
+ # Examples
94
+ # delete(safe)
95
+ # # => true or false
96
+ #
97
+ # Returns true as long as the master is not working.
98
    # Remove every redis key belonging to this process.
    #
    # safe - when true (default), refuse to delete while the Master is
    #        still working on this pid, so workers are not left reading
    #        keys out from under themselves.
    #
    # NOTE(review): the glob "mapredus:process:#{@pid}*" is prefix-based,
    # so pid 1 would also match keys belonging to pids 10, 12, 100, ... —
    # verify the ProcessInfo key layout delimits the pid, or tighten the
    # pattern.
    #
    # Returns true if deletion proceeded, false if blocked by a working master.
    def delete(safe = true)
      return false if (safe && Master.working?(@pid))
      FileSystem.keys("mapredus:process:#{@pid}*").each do |k|
        FileSystem.del(k)
      end
      FileSystem.srem(ProcessInfo.processes, @pid)
      # Reset the pid counter once the last registered process is gone.
      FileSystem.set(ProcessInfo.processes_count, 0) if( 0 == FileSystem.scard(ProcessInfo.processes) )
      true
    end
107
+
108
+ # Iterates through the key, values
109
+ #
110
+ # Example
111
+ # each_key_reduced_value(pid)
112
+ #
113
+ # Returns nothing.
114
+ def each_key_reduced_value
115
+ map_keys.each do |key|
116
+ reduce_values(key).each do |value|
117
+ yield key, value
118
+ end
119
+ end
120
+ end
121
+
122
+ # Iterates through the key, values
123
+ #
124
+ # Example
125
+ # each_key_nonreduced_value(pid)
126
+ #
127
+ # Returns nothing.
128
+ def each_key_nonreduced_value
129
+ map_keys.each do |key|
130
+ map_values(key).each do |value|
131
+ yield key, value
132
+ end
133
+ end
134
+ end
135
+
136
+ def run( data_object, synchronous = false )
137
+ update(:synchronous => synchronous)
138
+ Master.mapreduce( self, data_object )
139
+ true
140
+ end
141
+
142
+ # TODO:
143
+ # Should also have some notion of whether the process is completed or not
144
+ # since the master might not be working, but the process is not yet complete
145
+ # so it is still running
146
+ def running?
147
+ Master.working?(@pid)
148
+ end
149
+
150
+ # Change the process state
151
+ # if the process is not running and is not synchronous
152
+ #
153
+ # Examples
154
+ # process.next_state(pid)
155
+ #
156
+ # returns the state that the process switched to (or stays the same)
157
+ def next_state
158
+ if((not running?) and (not @synchronous))
159
+ new_state = STATE_MACHINE[self.state]
160
+ update(:state => new_state)
161
+ method = "enslave_#{new_state}".to_sym
162
+ Master.send(method, self) if( Master.respond_to?(method) )
163
+ new_state
164
+ end
165
+ end
166
+
167
+ ### The following functions deal with keys/values produced during the
168
+ ### running of a process
169
+
170
+ # Emissions, when we get map/reduce results back we emit these
171
+ # to be stored in our file system (redis)
172
+ #
173
+ # key_value - The key, value
174
+ #
175
+ # Examples
176
+ # emit_intermediate(key, value)
177
+ # # =>
178
+ # emit_intermediate(rank, key, value)
179
+ #
180
+ # Returns the true on success.
181
+ def emit_intermediate(*key_value)
182
+ if( not @ordered )
183
+ key, value = key_value
184
+ FileSystem.sadd( ProcessInfo.keys(@pid), key )
185
+ hashed_key = Helper.hash(key)
186
+ FileSystem.rpush( ProcessInfo.map(@pid, hashed_key), value )
187
+ else
188
+ # if there's an order for the process then we should use a zset above
189
+ # ordered process's map emits [rank, key, value]
190
+ #
191
+ rank, key, value = key_value
192
+ FileSystem.zadd( ProcessInfo.keys(@pid), rank, key )
193
+ hashed_key = Helper.hash(key)
194
+ FileSystem.rpush( ProcessInfo.map(@pid, hashed_key), value )
195
+ end
196
+ raise "Key Collision: key:#{key}, #{key.class} => hashed key:#{hashed_key}" if key_collision?(hashed_key, key)
197
+ true
198
+ end
199
+
200
+ def emit(key, reduce_val)
201
+ hashed_key = Helper.hash(key)
202
+ FileSystem.rpush( ProcessInfo.reduce(@pid, hashed_key), reduce_val )
203
+ end
204
+
205
+ def key_collision?(hashed_key, key)
206
+ not ( FileSystem.setnx( ProcessInfo.hash_to_key(@pid, hashed_key), key ) ||
207
+ FileSystem.get( ProcessInfo.hash_to_key(@pid, hashed_key) ) == key.to_s )
208
+ end
209
+
210
+ # Saves the result to the specified keyname, using the specified outputter
211
+ #
212
+ # Example
213
+ # (mapreduce:process:result:KEYNAME)
214
+ # OR
215
+ # process:pid:result
216
+ #
217
+ # The client must ensure that the result will not be affected when to_s is applied
218
+ # since redis stores all values as strings
219
+ #
220
+ # Returns true on success.
221
+ def save_result(result)
222
+ res = @outputter.encode(result)
223
+ FileSystem.save(ProcessInfo.result(@pid), res)
224
+ FileSystem.save(ProcessInfo.result_cache(@keyname), res, @result_timeout) if @keyname
225
+ true
226
+ end
227
+
228
    # Fetch the cached result stored under @keyname and decode it with
    # this process's outputter.
    #
    # NOTE(review): the class method Process.get_saved_result is not
    # defined anywhere in this file — only Process.delete_saved_result
    # is.  Verify it exists elsewhere (presumably reading
    # ProcessInfo.result_cache(keyname)), otherwise this raises
    # NoMethodError.
    def get_saved_result
      @outputter.decode(Process.get_saved_result(@keyname))
    end
231
+
232
+ def delete_saved_result
233
+ Process.delete_saved_result(@keyname)
234
+ end
235
+
236
+ # Keys that the map operation produced
237
+ #
238
+ # Examples
239
+ # map_keys
240
+ # # =>
241
+ #
242
+ # Returns the Keys.
243
+ def map_keys
244
+ if( not @ordered )
245
+ FileSystem.smembers( ProcessInfo.keys(@pid) )
246
+ else
247
+ FileSystem.zrange( ProcessInfo.keys(@pid), 0, -1 )
248
+ end
249
+ end
250
+
251
+ def num_values(key)
252
+ hashed_key = Helper.hash(key)
253
+ FileSystem.llen( ProcessInfo.map(@pid, hashed_key) )
254
+ end
255
+
256
+ # values that the map operation produced, for a key
257
+ #
258
+ # Examples
259
+ # map_values(key)
260
+ # # =>
261
+ #
262
+ # Returns the values.
263
+ def map_values(key)
264
+ hashed_key = Helper.hash(key)
265
+ FileSystem.lrange( ProcessInfo.map(@pid, hashed_key), 0, -1 )
266
+ end
267
+
268
+
269
+ # values that the reduce operation produced, for a key
270
+ #
271
+ # Examples
272
+ # reduce_values(key)
273
+ # # =>
274
+ #
275
+ # Returns the values.
276
+ def reduce_values(key)
277
+ hashed_key = Helper.hash(key)
278
+ FileSystem.lrange( ProcessInfo.reduce(@pid, hashed_key), 0, -1 )
279
+ end
280
+
281
+ # Map and Reduce are strings naming the Mapper and Reducer
282
+ # classes we want to run our map reduce with.
283
+ #
284
+ # For instance
285
+ # Mapper = "Mapper"
286
+ # Reducer = "Reducer"
287
+ #
288
+ # Default finalizer
289
+ # "MapRedus::Finalizer"
290
+ #
291
+ # Returns the new process id.
292
+ def self.create( *args )
293
+ new_pid = get_available_pid
294
+
295
+ spec = specification(*args)
296
+ return nil unless spec
297
+
298
+ Process.new(new_pid, spec).save
299
+ end
300
+
301
+ def self.specification(*args)
302
+ raise ProcessSpecificationError
303
+ end
304
+
305
+ def self.info(pid)
306
+ FileSystem.keys(ProcessInfo.pid(pid) + "*")
307
+ end
308
+
309
+ def self.open(pid)
310
+ spec = Helper.decode( FileSystem.get(ProcessInfo.pid(pid)) )
311
+ spec && Process.new( pid, spec )
312
+ end
313
+
314
+ # Find out what map reduce processes are out there
315
+ #
316
+ # Examples
317
+ # FileSystem::ps
318
+ #
319
+ # Returns a list of the map reduce process ids
320
+ def self.ps
321
+ FileSystem.smembers(ProcessInfo.processes)
322
+ end
323
+
324
+ # Pick a fresh process id for a new map reduce process
325
+ #
326
+ # Examples
327
+ # Process::get_available_pid
328
+ #
329
+ # Returns an available pid.
330
+ def self.get_available_pid
331
+ FileSystem.incrby(ProcessInfo.processes_count, 1 + rand(20))
332
+ end
333
+
334
+ # Given a result keyname, delete the result
335
+ #
336
+ # Examples
337
+ # Process.delete_saved_result(key)
338
+ def self.delete_saved_result(keyname)
339
+ FileSystem.del( ProcessInfo.result_cache(keyname) )
340
+ end
341
+
342
+ # Remove redis keys associated with this process if the Master isn't working.
343
+ #
344
+ # potentially is very expensive.
345
+ #
346
+ # Example
347
+ # Process::kill(pid)
348
+ # # => true
349
+ #
350
+ # Returns true on success.
351
+ def self.kill(pid)
352
+ num_killed = Master.emancipate(pid)
353
+ proc = Process.open(pid)
354
+ proc.delete if proc
355
+ num_killed
356
+ end
357
+
358
+ def self.kill_all
359
+ ps.each do |pid|
360
+ kill(pid)
361
+ end
362
+ FileSystem.del(ProcessInfo.processes)
363
+ FileSystem.del(ProcessInfo.processes_count)
364
+ end
365
+ end
366
+ end
@@ -0,0 +1,39 @@
1
+ module MapRedus
2
+ # Reduce is a function that takes in "all" the values for a single given key
3
+ # and outputs a list of values or a single value that usually "reduces"
4
+ # the initial given value set.
5
+ #
6
+ # The output of the reduce shall always be
7
+ # reduce(values) = [ reduced value, reduced value, ... ]
8
+ # and it will often only be a single element array
9
+ #
10
+ # The input values and the output values of the reduce will always
11
+ # be a string. As described in the paper, it is up to the client
12
+ # to define how to deal with this restriction.
13
+ #
14
+ class Reducer < QueueProcess
15
+ #
16
+ # After a recoverable fail this describes how much time we shall wait before
17
+ # readding the reducer back on to the queue.
18
+ #
19
+ DEFAULT_WAIT = 10 # seconds
20
+ def self.wait; DEFAULT_WAIT; end
21
+
22
+ def self.reduce(values); raise InvalidReducer; end
23
+
24
+ # Doesn't handle redundant workers and fault tolerance
25
+ #
26
+ # TODO: Resque::AutoRetry might mess this up.
27
    # Resque entry point: reduce all map values for +key+ in process +pid+.
    #
    # Calls self.reduce with a block; subclasses are expected to yield
    # each reduced value, which is emitted into the process's reduce
    # store via Process#emit.
    #
    # On MapRedus::RecoverableFail the reduce is re-queued for later
    # (Master.enslave_later_reduce); either way the ensure clause frees
    # the slave slot and advances the process state machine.
    #
    # NOTE(review): if Process.open(pid) returns nil (process data
    # already deleted), process.map_values raises NoMethodError and the
    # ensure clause raises again on process.next_state — confirm the pid
    # is always live when this job runs.
    def self.perform(pid, key)
      process = Process.open(pid)
      reduce(process.map_values(key)) do |reduce_val|
        process.emit( key, reduce_val )
      end
    rescue MapRedus::RecoverableFail
      Master.enslave_later_reduce(process, key)
    ensure
      Master.free_slave(pid)
      process.next_state
    end
38
+ end
39
+ end
@@ -0,0 +1,56 @@
1
+ module MapRedus
2
+ module Support
3
+ class MapRedusRunnerError < StandardError; end
4
+ class DuplicateProcessDefinitionError < MapRedusRunnerError ; end
5
+
6
+ class Runner
7
+ attr_reader :process
8
+ def initialize(class_name)
9
+ @class = class_name
10
+ end
11
+
12
+ def method_missing(method, *args, &block)
13
+ mr_process = "#{@class}_#{method.to_s}"
14
+ if self.respond_to?(mr_process)
15
+ self.send(mr_process, *args, &block)
16
+ else
17
+ super(method, *args, &block)
18
+ end
19
+ end
20
+ end
21
+
22
+ def mapreduce
23
+ @mapreduce_runner ||= Runner.new(self.class.to_s.gsub(/\W/,"_"))
24
+ end
25
+
26
+ module ClassMethods
27
+ def mapreduce_process( process_name, mapredus_process_class, result_store, opts = {})
28
+ runner_self = Runner
29
+ class_name = self.to_s.gsub(/\W/,"_")
30
+
31
+ global_process_name = "#{class_name}_#{process_name.to_s}"
32
+
33
+ if runner_self.methods.include?(global_process_name)
34
+ raise DuplicateProcessDefintionError
35
+ end
36
+
37
+ keyname = "mapredus_key_#{global_process_name}"
38
+ RedisSupport.redis_key( keyname, result_store )
39
+
40
+ runner_self.send( :define_method, global_process_name ) do |data, *var|
41
+ @process = mapredus_process_class.create
42
+ @process.update(:keyname => RedisSupport::Keys.send( keyname, *var ))
43
+ @process.run(data)
44
+ end
45
+
46
+ runner_self.send( :define_method, "#{global_process_name}_result" ) do |*outputter_args|
47
+ @process.outputter.decode(@process.keyname, *outputter_args)
48
+ end
49
+ end
50
+ end
51
+
52
+ def self.included(model)
53
+ model.extend ClassMethods
54
+ end
55
+ end
56
+ end
data/lib/mapredus.rb ADDED
@@ -0,0 +1,106 @@
1
+ require 'digest/md5'
+ require 'redis'
2
+ require 'redis_support'
3
+ require 'resque'
4
+ require 'resque_scheduler'
5
+
6
+ module MapRedus
7
+ include RedisSupport
8
+
9
+ class InvalidProcess < NotImplementedError
10
+ def initialize; super("MapRedus QueueProcess: need to have perform method defined");end
11
+ end
12
+
13
+ class ProcessSpecificationError < InvalidProcess
14
+ def initialize; super("MapRedus Process: need to have the specification defined");end
15
+ end
16
+
17
+ class InvalidMapper < NotImplementedError
18
+ def initialize; super("MapRedus Mapper: need to have map method defined");end
19
+ end
20
+
21
+ class InvalidReducer < NotImplementedError
22
+ def initialize; super("MapRedus Reducer: need to have reduce method defined");end
23
+ end
24
+
25
+ class InvalidInputStream < NotImplementedError
26
+ def initialize; super("MapRedus InputStream: need to have scan method defined");end
27
+ end
28
+
29
  # FIXME: InvalidProcess is already defined above with the message
  # "need to have perform method defined".  Reopening it here replaces
  # that #initialize, so only this message survives.  This second error
  # probably deserves its own class (e.g. InvalidProcessSpecification).
  class InvalidProcess < NotImplementedError
    def initialize; super("MapRedus Process Creation Failed: Specifications were not specified");end
  end
32
+
33
+ class RecoverableFail < StandardError
34
+ def initialize; super("MapRedus Operation Failed: but it is recoverable") ;end
35
+ end
36
+
37
+ # All Queue Processes should have a function called perform
38
+ # ensuring that when the class is put on the resque queue it can perform its work
39
+ #
40
+ # Caution: defines redis, which is also defined in RedisSupport
41
+ #
42
+ class QueueProcess
43
+ def self.queue; :mapredus; end
44
+ def self.perform(*args); raise InvalidProcess; end
45
+ end
46
+
47
+ # TODO: When you send work to a worker using a mapper you define,
48
+ # the worker won't have that class name defined, unless it was started up
49
+ # with the class loaded
50
+ #
51
+ def register_reducer(klass); end;
52
+ def register_mapper(klass); end;
53
+
54
+ class Helper
55
+ # resque helpers defines
56
+ # redis
57
+ # encode
58
+ # decode
59
+ # classify
60
+ # constantize
61
+ #
62
+ # This is extended here because we want to use the encode and decode function
63
+ # when we interact with resque queues
64
+ extend Resque::Helpers
65
+
66
+ # Defines a hash by taking the absolute value of ruby's string
67
+ # hash to rid the dashes since redis keys should not contain any.
68
+ #
69
+ # key - The key to be hashed.
70
+ #
71
+ # Examples
72
+ #
73
+ # Support::hash( key )
74
+ # # => '8dd8hflf8dhod8doh9hef'
75
+ #
76
+ # Returns the hash.
77
+ def self.hash( key )
78
+ key.to_s.hash.abs.to_s(16)
79
+ end
80
+
81
    # Resolve a class/module from its (possibly namespaced) name string.
    #
    # string - the full name of the class,
    #          e.g. "Super::Long::Namespace::ClassName"
    #
    # Examples
    #
    #   Helper.class_get( "Super::Long::Namespace::ClassName" )
    #   # => Super::Long::Namespace::ClassName
    #
    # Returns the constant (the class/module object itself, not its
    # name); constantize is provided by Resque::Helpers and raises
    # NameError when the constant does not exist.
    def self.class_get(string)
      constantize(string)
    end
94
+ end
95
+ end
96
+
97
+ require 'mapredus/keys'
98
+ require 'mapredus/process'
99
+ require 'mapredus/filesystem'
100
+ require 'mapredus/master'
101
+ require 'mapredus/mapper'
102
+ require 'mapredus/reducer'
103
+ require 'mapredus/finalizer'
104
+ require 'mapredus/support'
105
+ require 'mapredus/outputter'
106
+ require 'mapredus/inputter'
data/spec/helper.rb ADDED
@@ -0,0 +1,47 @@
1
+ require 'rubygems'
2
+ require 'spec'
3
+
4
+ dir = File.dirname(__FILE__)
5
+ $LOAD_PATH.unshift(File.join(dir, '..', 'lib'))
6
+ $LOAD_PATH.unshift(dir)
7
+ require 'mapredus'
8
+
9
+ #
10
+ # make sure we can run redis
11
+ #
12
+ if !system("which redis-server")
13
+ puts '', "** can't find `redis-server` in your path"
14
+ abort ''
15
+ end
16
+
17
+ #
18
+ # start our own redis when the tests start,
19
+ # kill it when they end (redis is run as a daemon)
20
+ #
21
+ puts "Starting redis for testing at localhost:9736..."
22
+ `redis-server #{dir}/redis-test.conf`
23
+
24
# Tear down the daemonized test redis server when the test run ends.
at_exit do
  #
  # hope that no other processes have redis-test in the name...
  # TODO: fixme
  #
  pid = `ps -A -o pid,command | grep [r]edis-test`.split(" ")[0]
  puts "Killing test redis server..."
  `rm -f #{dir}/dump.rdb`
  # BUGFIX: guard against the grep matching nothing — `nil.to_i` is 0,
  # and Process.kill("KILL", 0) signals the *entire process group*,
  # killing the test runner itself.
  if pid
    Process.kill("KILL", pid.to_i)
  else
    puts "Test redis server not found; nothing to kill."
  end
end
34
+
35
+ #
36
+ # Set the redis server
37
+ #
38
+ MapRedus.redis = 'localhost:9736:0'
39
+ Resque.redis = MapRedus.redis
40
+ require 'resque/failure/redis'
41
+ Resque::Failure.backend = Resque::Failure::Redis
42
+
43
+ require 'helper_classes'
44
+
45
+ def work_off
46
+ Resque::Worker.new("*").work(0)
47
+ end