mapredus 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,366 @@
1
+ module MapRedus
2
+
3
+ # This is what keeps track of our map reduce processes
4
+ #
5
+ # We use a redis key to identify the id of map reduce process
6
+ # the value of the redis object is a json object which contains:
7
+ #
8
+ # {
9
+ # mapper : mapclass,
10
+ # reducer : reduceclass,
11
+ # finalizer : finalizerclass,
12
+ # partitioner : <not supported>,
13
+ # combiner : <not supported>,
14
+ # ordered : true_or_false ## ensures ordering keys from the map output --> [ order, key, value ],
15
+ # synchronous : true_or_false ## runs the process synchronously or not (generally used for testing)
16
+ # result_timeout : length of time a result is saved ## 3600 * 24
17
+ # keyname : the location to the save the result of the process (cache location)
18
+ # state : the current state of the process (shouldn't be set by the process and starts off as nil)
19
+ # }
20
+ #
21
+ # The user has the ability in subclassing this class to create extra features if needed
22
+ #
23
+ class Process
24
+ # Public: Keep track of information that may show up as the redis json value
25
+ # This is so we know exactly what might show up in the json hash
26
+ READERS = [:pid]
27
+ ATTRS = [:inputter, :mapper, :reducer, :finalizer, :outputter, :ordered, :synchronous, :result_timeout, :keyname, :state]
28
+ READERS.each { |r| attr_reader r }
29
+ ATTRS.each { |a| attr_accessor a }
30
+
31
+ DEFAULT_TIME = 3600 * 24
32
+ def initialize(pid, json_info)
33
+ @pid = pid
34
+ read(json_info)
35
+ end
36
+
37
+ def read(json_info)
38
+ @inputter = Helper.class_get(json_helper(json_info, :inputter))
39
+ @mapper = Helper.class_get(json_helper(json_info, :mapper))
40
+ @reducer = Helper.class_get(json_helper(json_info, :reducer))
41
+ @finalizer = Helper.class_get(json_helper(json_info, :finalizer))
42
+ @ordered = json_helper(json_info, :ordered)
43
+ @synchronous = json_helper(json_info, :synchronous)
44
+ @result_timeout = json_helper(json_info, :result_timeout) || DEFAULT_TIME
45
+ @keyname = json_helper(json_info, :keyname)
46
+ @state = json_helper(json_info, :state) || NOT_STARTED
47
+ @outputter = json_helper(json_info, :outputter)
48
+ @outputter = @outputter ? Helper.class_get(@outputter) : MapRedus::Outputter
49
+ end
50
+
51
# Look up +key+ in a decoded JSON hash, tolerating either string or
# symbol keys (JSON decoding yields string keys; in-memory specs may
# use symbols).
#
# Returns the stored value, or nil when the key is absent under both forms.
def json_helper(json_info, key)
  string_hit = json_info[key.to_s]
  string_hit || json_info[key.to_sym]
end
54
+
55
+ def to_s; to_json; end
56
+
57
# Build a plain Hash of every persisted attribute (ATTRS) plus the
# read-only fields (READERS), keyed by attribute symbol.  This is the
# shape that gets JSON-encoded into redis by #to_json.
def to_hash
  (ATTRS + READERS).each_with_object({}) do |attr, hash|
    hash[attr] = send(attr)
  end
end
63
+
64
+ def to_json
65
+ Helper.encode(to_hash)
66
+ end
67
+
68
+ def save
69
+ FileSystem.sadd( ProcessInfo.processes, @pid )
70
+ FileSystem.save( ProcessInfo.pid(@pid), to_json )
71
+ self
72
+ end
73
+
74
+ def update(attrs = {})
75
+ attrs.each do |attr, val|
76
+ send("#{attr}=", val)
77
+ end
78
+ save
79
+ end
80
+
81
+ def reload
82
+ read(Helper.decode(FileSystem.get(ProcessInfo.pid(@pid))))
83
+ self
84
+ end
85
+
86
+ # This will not delete if the master is working
87
+ # It can't get ahold of the files to shred while the master is working
88
+ #
89
+ # if safe is set to false, this will delete all the redis stores associated
90
+ # with this process, but will not kill the process from the queue, if it is
91
+ # on the queue. The process operations will fail to work when its data is deleted
92
+ #
93
+ # Examples
94
+ # delete(safe)
95
+ # # => true or false
96
+ #
97
+ # Returns true as long as the master is not working.
98
+ def delete(safe = true)
99
+ return false if (safe && Master.working?(@pid))
100
+ FileSystem.keys("mapredus:process:#{@pid}*").each do |k|
101
+ FileSystem.del(k)
102
+ end
103
+ FileSystem.srem(ProcessInfo.processes, @pid)
104
+ FileSystem.set(ProcessInfo.processes_count, 0) if( 0 == FileSystem.scard(ProcessInfo.processes) )
105
+ true
106
+ end
107
+
108
+ # Iterates through the key, values
109
+ #
110
+ # Example
111
+ # each_key_reduced_value(pid)
112
+ #
113
+ # Returns nothing.
114
+ def each_key_reduced_value
115
+ map_keys.each do |key|
116
+ reduce_values(key).each do |value|
117
+ yield key, value
118
+ end
119
+ end
120
+ end
121
+
122
+ # Iterates through the key, values
123
+ #
124
+ # Example
125
+ # each_key_nonreduced_value(pid)
126
+ #
127
+ # Returns nothing.
128
+ def each_key_nonreduced_value
129
+ map_keys.each do |key|
130
+ map_values(key).each do |value|
131
+ yield key, value
132
+ end
133
+ end
134
+ end
135
+
136
+ def run( data_object, synchronous = false )
137
+ update(:synchronous => synchronous)
138
+ Master.mapreduce( self, data_object )
139
+ true
140
+ end
141
+
142
+ # TODO:
143
+ # Should also have some notion of whether the process is completed or not
144
+ # since the master might not be working, but the process is not yet complete
145
+ # so it is still running
146
+ def running?
147
+ Master.working?(@pid)
148
+ end
149
+
150
+ # Change the process state
151
+ # if the process is not running and is not synchronous
152
+ #
153
+ # Examples
154
+ # process.next_state(pid)
155
+ #
156
+ # returns the state that the process switched to (or stays the same)
157
# Advance the process to its next state via STATE_MACHINE, but only
# when the process is neither currently running nor synchronous.
#
# After recording the new state, hands the matching work off to the
# Master when it defines an enslave_<state> hook.
#
# Returns the new state, or nil when no transition happened.
def next_state
  return if running? || @synchronous

  new_state = STATE_MACHINE[self.state]
  update(:state => new_state)

  # Kick off the corresponding stage of work, if the Master knows how.
  method = "enslave_#{new_state}".to_sym
  Master.send(method, self) if Master.respond_to?(method)
  new_state
end
166
+
167
+ ### The following functions deal with keys/values produced during the
168
+ ### running of a process
169
+
170
+ # Emissions, when we get map/reduce results back we emit these
171
+ # to be stored in our file system (redis)
172
+ #
173
+ # key_value - The key, value
174
+ #
175
+ # Examples
176
+ # emit_intermediate(key, value)
177
+ # # =>
178
+ # emit_intermediate(rank, key, value)
179
+ #
180
+ # Returns the true on success.
181
+ def emit_intermediate(*key_value)
182
+ if( not @ordered )
183
+ key, value = key_value
184
+ FileSystem.sadd( ProcessInfo.keys(@pid), key )
185
+ hashed_key = Helper.hash(key)
186
+ FileSystem.rpush( ProcessInfo.map(@pid, hashed_key), value )
187
+ else
188
+ # if there's an order for the process then we should use a zset above
189
+ # ordered process's map emits [rank, key, value]
190
+ #
191
+ rank, key, value = key_value
192
+ FileSystem.zadd( ProcessInfo.keys(@pid), rank, key )
193
+ hashed_key = Helper.hash(key)
194
+ FileSystem.rpush( ProcessInfo.map(@pid, hashed_key), value )
195
+ end
196
+ raise "Key Collision: key:#{key}, #{key.class} => hashed key:#{hashed_key}" if key_collision?(hashed_key, key)
197
+ true
198
+ end
199
+
200
+ def emit(key, reduce_val)
201
+ hashed_key = Helper.hash(key)
202
+ FileSystem.rpush( ProcessInfo.reduce(@pid, hashed_key), reduce_val )
203
+ end
204
+
205
+ def key_collision?(hashed_key, key)
206
+ not ( FileSystem.setnx( ProcessInfo.hash_to_key(@pid, hashed_key), key ) ||
207
+ FileSystem.get( ProcessInfo.hash_to_key(@pid, hashed_key) ) == key.to_s )
208
+ end
209
+
210
+ # Saves the result to the specified keyname, using the specified outputter
211
+ #
212
+ # Example
213
+ # (mapreduce:process:result:KEYNAME)
214
+ # OR
215
+ # process:pid:result
216
+ #
217
+ # The client must ensure that the result will not be affected when to_s is applied
218
+ # since redis stores all values as strings
219
+ #
220
+ # Returns true on success.
221
+ def save_result(result)
222
+ res = @outputter.encode(result)
223
+ FileSystem.save(ProcessInfo.result(@pid), res)
224
+ FileSystem.save(ProcessInfo.result_cache(@keyname), res, @result_timeout) if @keyname
225
+ true
226
+ end
227
+
228
+ def get_saved_result
229
+ @outputter.decode(Process.get_saved_result(@keyname))
230
+ end
231
+
232
+ def delete_saved_result
233
+ Process.delete_saved_result(@keyname)
234
+ end
235
+
236
+ # Keys that the map operation produced
237
+ #
238
+ # Examples
239
+ # map_keys
240
+ # # =>
241
+ #
242
+ # Returns the Keys.
243
+ def map_keys
244
+ if( not @ordered )
245
+ FileSystem.smembers( ProcessInfo.keys(@pid) )
246
+ else
247
+ FileSystem.zrange( ProcessInfo.keys(@pid), 0, -1 )
248
+ end
249
+ end
250
+
251
+ def num_values(key)
252
+ hashed_key = Helper.hash(key)
253
+ FileSystem.llen( ProcessInfo.map(@pid, hashed_key) )
254
+ end
255
+
256
+ # values that the map operation produced, for a key
257
+ #
258
+ # Examples
259
+ # map_values(key)
260
+ # # =>
261
+ #
262
+ # Returns the values.
263
+ def map_values(key)
264
+ hashed_key = Helper.hash(key)
265
+ FileSystem.lrange( ProcessInfo.map(@pid, hashed_key), 0, -1 )
266
+ end
267
+
268
+
269
+ # values that the reduce operation produced, for a key
270
+ #
271
+ # Examples
272
+ # reduce_values(key)
273
+ # # =>
274
+ #
275
+ # Returns the values.
276
+ def reduce_values(key)
277
+ hashed_key = Helper.hash(key)
278
+ FileSystem.lrange( ProcessInfo.reduce(@pid, hashed_key), 0, -1 )
279
+ end
280
+
281
+ # Map and Reduce are strings naming the Mapper and Reducer
282
+ # classes we want to run our map reduce with.
283
+ #
284
+ # For instance
285
+ # Mapper = "Mapper"
286
+ # Reducer = "Reducer"
287
+ #
288
+ # Default finalizer
289
+ # "MapRedus::Finalizer"
290
+ #
291
+ # Returns the new process id.
292
+ def self.create( *args )
293
+ new_pid = get_available_pid
294
+
295
+ spec = specification(*args)
296
+ return nil unless spec
297
+
298
+ Process.new(new_pid, spec).save
299
+ end
300
+
301
+ def self.specification(*args)
302
+ raise ProcessSpecificationError
303
+ end
304
+
305
+ def self.info(pid)
306
+ FileSystem.keys(ProcessInfo.pid(pid) + "*")
307
+ end
308
+
309
+ def self.open(pid)
310
+ spec = Helper.decode( FileSystem.get(ProcessInfo.pid(pid)) )
311
+ spec && Process.new( pid, spec )
312
+ end
313
+
314
+ # Find out what map reduce processes are out there
315
+ #
316
+ # Examples
317
+ # FileSystem::ps
318
+ #
319
+ # Returns a list of the map reduce process ids
320
+ def self.ps
321
+ FileSystem.smembers(ProcessInfo.processes)
322
+ end
323
+
324
+ # Generate a fresh pid for a new map reduce process
325
+ #
326
+ # Examples
327
+ # FileSystem::get_available_pid
328
+ #
329
+ # Returns an available pid.
330
+ def self.get_available_pid
331
+ FileSystem.incrby(ProcessInfo.processes_count, 1 + rand(20))
332
+ end
333
+
334
+ # Given a result keyname, delete the result
335
+ #
336
+ # Examples
337
+ # Process.delete_saved_result(key)
338
+ def self.delete_saved_result(keyname)
339
+ FileSystem.del( ProcessInfo.result_cache(keyname) )
340
+ end
341
+
342
+ # Remove redis keys associated with this process if the Master isn't working.
343
+ #
344
+ # potentially is very expensive.
345
+ #
346
+ # Example
347
+ # Process::kill(pid)
348
+ # # => true
349
+ #
350
+ # Returns true on success.
351
+ def self.kill(pid)
352
+ num_killed = Master.emancipate(pid)
353
+ proc = Process.open(pid)
354
+ proc.delete if proc
355
+ num_killed
356
+ end
357
+
358
+ def self.kill_all
359
+ ps.each do |pid|
360
+ kill(pid)
361
+ end
362
+ FileSystem.del(ProcessInfo.processes)
363
+ FileSystem.del(ProcessInfo.processes_count)
364
+ end
365
+ end
366
+ end
@@ -0,0 +1,39 @@
1
+ module MapRedus
2
+ # Reduce is a function that takes in "all" the values for a single given key
3
+ # and outputs a list of values or a single value that usually "reduces"
4
+ # the initial given value set.
5
+ #
6
+ # The output of the reduce shall always be
7
+ # reduce(values) = [ reduced value, reduced value, ... ]
8
+ # and it will often only be a single element array
9
+ #
10
+ # The input values and the output values of the reduce will always
11
+ # be a string. As described in the paper, it is up to the client
12
+ # to define how to deal with this restriction.
13
+ #
14
+ class Reducer < QueueProcess
15
+ #
16
+ # After a recoverable fail this describes how much time we shall wait before
17
+ # readding the reducer back on to the queue.
18
+ #
19
+ DEFAULT_WAIT = 10 # seconds
20
+ def self.wait; DEFAULT_WAIT; end
21
+
22
+ def self.reduce(values); raise InvalidReducer; end
23
+
24
+ # Doesn't handle redundant workers and fault tolerance
25
+ #
26
+ # TODO: Resque::AutoRetry might mess this up.
27
# Run the reduce stage for a single key of a process.
#
# pid - the map reduce process id
# key - the map output key whose values should be reduced
#
# Reduces all map values for +key+ and emits each reduced value back
# into the process's storage.  On a RecoverableFail the reduce is
# re-enqueued to run later.  The worker slot is always freed.
#
# Doesn't handle redundant workers and fault tolerance
#
# TODO: Resque::AutoRetry might mess this up.
def self.perform(pid, key)
  process = Process.open(pid)
  # The process data may have been deleted out from under us; there is
  # nothing to reduce (and nothing to advance) in that case.
  return unless process
  reduce(process.map_values(key)) do |reduce_val|
    process.emit( key, reduce_val )
  end
rescue MapRedus::RecoverableFail
  Master.enslave_later_reduce(process, key)
ensure
  Master.free_slave(pid)
  # Guard against a nil process: calling next_state on nil inside
  # ensure would raise NoMethodError and mask the real outcome.
  process.next_state if process
end
38
+ end
39
+ end
@@ -0,0 +1,56 @@
1
+ module MapRedus
2
+ module Support
3
+ class MapRedusRunnerError < StandardError; end
4
+ class DuplicateProcessDefinitionError < MapRedusRunnerError ; end
5
+
6
+ class Runner
7
+ attr_reader :process
8
+ def initialize(class_name)
9
+ @class = class_name
10
+ end
11
+
12
+ def method_missing(method, *args, &block)
13
+ mr_process = "#{@class}_#{method.to_s}"
14
+ if self.respond_to?(mr_process)
15
+ self.send(mr_process, *args, &block)
16
+ else
17
+ super(method, *args, &block)
18
+ end
19
+ end
20
+ end
21
+
22
+ def mapreduce
23
+ @mapreduce_runner ||= Runner.new(self.class.to_s.gsub(/\W/,"_"))
24
+ end
25
+
26
+ module ClassMethods
27
# Define a named mapredus process on the including class.
#
# process_name           - Symbol/String naming the process.
# mapredus_process_class - Process subclass whose create/run drives the work.
# result_store           - RedisSupport key template for the result cache.
# opts                   - reserved for future options (currently unused).
#
# Defines two methods on Runner:
#   <class>_<process_name>(data, *key_args)        - kicks off the process
#   <class>_<process_name>_result(*outputter_args) - decodes the saved result
#
# Raises DuplicateProcessDefinitionError when the same process is
# defined twice for a class.
def mapreduce_process( process_name, mapredus_process_class, result_store, opts = {})
  runner_self = Runner
  class_name = self.to_s.gsub(/\W/,"_")

  global_process_name = "#{class_name}_#{process_name.to_s}"

  # method_defined? checks instance methods (which is what define_method
  # creates below) and accepts a String or Symbol; the old
  # `methods.include?(string)` compared class-method symbols against a
  # string and so never detected a duplicate.
  if runner_self.method_defined?(global_process_name)
    raise DuplicateProcessDefinitionError
  end

  keyname = "mapredus_key_#{global_process_name}"
  RedisSupport.redis_key( keyname, result_store )

  runner_self.send( :define_method, global_process_name ) do |data, *var|
    @process = mapredus_process_class.create
    @process.update(:keyname => RedisSupport::Keys.send( keyname, *var ))
    @process.run(data)
  end

  runner_self.send( :define_method, "#{global_process_name}_result" ) do |*outputter_args|
    @process.outputter.decode(@process.keyname, *outputter_args)
  end
end
50
+ end
51
+
52
+ def self.included(model)
53
+ model.extend ClassMethods
54
+ end
55
+ end
56
+ end
data/lib/mapredus.rb ADDED
@@ -0,0 +1,106 @@
1
+ require 'redis'
2
+ require 'redis_support'
3
+ require 'resque'
4
+ require 'resque_scheduler'
5
+
6
+ module MapRedus
7
+ include RedisSupport
8
+
9
+ class InvalidProcess < NotImplementedError
10
+ def initialize; super("MapRedus QueueProcess: need to have perform method defined");end
11
+ end
12
+
13
+ class ProcessSpecificationError < InvalidProcess
14
+ def initialize; super("MapRedus Process: need to have the specification defined");end
15
+ end
16
+
17
+ class InvalidMapper < NotImplementedError
18
+ def initialize; super("MapRedus Mapper: need to have map method defined");end
19
+ end
20
+
21
+ class InvalidReducer < NotImplementedError
22
+ def initialize; super("MapRedus Reducer: need to have reduce method defined");end
23
+ end
24
+
25
+ class InvalidInputStream < NotImplementedError
26
+ def initialize; super("MapRedus InputStream: need to have scan method defined");end
27
+ end
28
+
29
+ class InvalidProcess < NotImplementedError
30
+ def initialize; super("MapRedus Process Creation Failed: Specifications were not specified");end
31
+ end
32
+
33
+ class RecoverableFail < StandardError
34
+ def initialize; super("MapRedus Operation Failed: but it is recoverable") ;end
35
+ end
36
+
37
+ # All Queue Processes should have a function called perform
38
+ # ensuring that when the class is put on the resque queue it can perform its work
39
+ #
40
+ # Caution: defines redis, which is also defined in RedisSupport
41
+ #
42
+ class QueueProcess
43
+ def self.queue; :mapredus; end
44
+ def self.perform(*args); raise InvalidProcess; end
45
+ end
46
+
47
+ # TODO: When you send work to a worker using a mapper you define,
48
+ # the worker won't have that class name defined, unless it was started up
49
+ # with the class loaded
50
+ #
51
+ def register_reducer(klass); end;
52
+ def register_mapper(klass); end;
53
+
54
+ class Helper
55
+ # resque helpers defines
56
+ # redis
57
+ # encode
58
+ # decode
59
+ # classify
60
+ # constantize
61
+ #
62
+ # This is extended here because we want to use the encode and decode function
63
+ # when we interact with resque queues
64
+ extend Resque::Helpers
65
+
66
+ # Defines a hash by taking the absolute value of ruby's string
67
+ # hash to rid the dashes since redis keys should not contain any.
68
+ #
69
+ # key - The key to be hashed.
70
+ #
71
+ # Examples
72
+ #
73
+ # Support::hash( key )
74
+ # # => '8dd8hflf8dhod8doh9hef'
75
+ #
76
+ # Returns the hash.
77
# Hash a key into a short, redis-safe token (hex digest: no dashes,
# stable across processes).
#
# key - The key to be hashed (any object; #to_s is applied first).
#
# Uses Digest::MD5 rather than String#hash because Ruby's #hash is
# seeded per process (since 1.9): a hashed key written by one Resque
# worker would not match the hash computed by another worker (or by
# the same worker after a restart), so map output could never be
# found again at reduce time.
#
# Examples
#
#   Helper.hash( key )
#   # => 'acbd18db4cc2f85cedef654fccc4a4d8'
#
# Returns the hex digest String.
def self.hash( key )
  require 'digest'
  Digest::MD5.hexdigest(key.to_s)
end
80
+
81
+ # Resolves a class object from its (possibly namespaced) name string.
82
+ #
83
+ # The full name of the class.
84
+ #
85
+ # Examples
86
+ #
87
+ # Support::class_get( Super::Long::Namespace::ClassName )
88
+ # # => 'ClassName'
89
+ #
90
+ # Returns the class constant.
91
+ def self.class_get(string)
92
+ constantize(string)
93
+ end
94
+ end
95
+ end
96
+
97
+ require 'mapredus/keys'
98
+ require 'mapredus/process'
99
+ require 'mapredus/filesystem'
100
+ require 'mapredus/master'
101
+ require 'mapredus/mapper'
102
+ require 'mapredus/reducer'
103
+ require 'mapredus/finalizer'
104
+ require 'mapredus/support'
105
+ require 'mapredus/outputter'
106
+ require 'mapredus/inputter'
data/spec/helper.rb ADDED
@@ -0,0 +1,47 @@
1
+ require 'rubygems'
2
+ require 'spec'
3
+
4
+ dir = File.dirname(__FILE__)
5
+ $LOAD_PATH.unshift(File.join(dir, '..', 'lib'))
6
+ $LOAD_PATH.unshift(dir)
7
+ require 'mapredus'
8
+
9
+ #
10
+ # make sure we can run redis
11
+ #
12
+ if !system("which redis-server")
13
+ puts '', "** can't find `redis-server` in your path"
14
+ abort ''
15
+ end
16
+
17
+ #
18
+ # start our own redis when the tests start,
19
+ # kill it when they end (redis is run as a daemon)
20
+ #
21
+ puts "Starting redis for testing at localhost:9736..."
22
+ `redis-server #{dir}/redis-test.conf`
23
+
24
+ at_exit do
25
+ #
26
+ # hope that no other processes have redis-test in the name...
27
+ # TODO: fixme
28
+ #
29
+ pid = `ps -A -o pid,command | grep [r]edis-test`.split(" ")[0]
30
+ puts "Killing test redis server..."
31
+ `rm -f #{dir}/dump.rdb`
32
+ Process.kill("KILL", pid.to_i)
33
+ end
34
+
35
+ #
36
+ # Set the redis server
37
+ #
38
+ MapRedus.redis = 'localhost:9736:0'
39
+ Resque.redis = MapRedus.redis
40
+ require 'resque/failure/redis'
41
+ Resque::Failure.backend = Resque::Failure::Redis
42
+
43
+ require 'helper_classes'
44
+
45
+ def work_off
46
+ Resque::Worker.new("*").work(0)
47
+ end