mapredus 0.0.1
- data/LICENSE +20 -0
- data/README.md +227 -0
- data/lib/mapredus/filesystem.rb +43 -0
- data/lib/mapredus/finalizer.rb +33 -0
- data/lib/mapredus/inputter.rb +31 -0
- data/lib/mapredus/keys.rb +86 -0
- data/lib/mapredus/mapper.rb +27 -0
- data/lib/mapredus/master.rb +182 -0
- data/lib/mapredus/outputter.rb +42 -0
- data/lib/mapredus/process.rb +366 -0
- data/lib/mapredus/reducer.rb +39 -0
- data/lib/mapredus/support.rb +56 -0
- data/lib/mapredus.rb +106 -0
- data/spec/helper.rb +47 -0
- data/spec/helper_classes.rb +102 -0
- data/spec/mapredus_spec.rb +295 -0
- metadata +144 -0
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
Copyright (c) 2010 Dolores Labs

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,227 @@
MapRedus
=========

Simple MapReduce-style framework using Redis and Resque.

Overview
--------

This is an experimental implementation of MapReduce using Ruby for
process definition, Resque for work execution, and Redis for data
storage.

Goals:

* simple M/R-style programming for existing Ruby projects
* low cost of entry (no need for a dedicated cluster)

If you are looking for a high-performance MapReduce implementation
that can meet your big data needs, try Hadoop.


Using MapRedus
---------------

MapRedus uses Resque to handle the processes that it runs, and Redis
as the store for the values/data produced.

Workers for a MapRedus process are Resque workers. Refer to the
Resque worker documentation to see how to load the necessary
environment for your worker to be able to run MapRedus processes. An
example is also located in the tests, and a sketch follows below.
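For illustration only, assuming your application's load file is
`environment.rb` (a hypothetical name) and that workers should pull
from MapRedus's default `:mapredus` queue, a worker could be started
with Resque's standard rake task:

    # Rakefile
    require 'resque/tasks'

    task "resque:setup" do
      # load MapRedus plus your inputter/mapper/reducer/finalizer classes
      require File.expand_path('environment', File.dirname(__FILE__))
    end

and then run:

    QUEUE=mapredus rake resque:work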
### Attaching a mapreduce process to a class

Oftentimes you'll want to define a mapreduce process that operates
on data within a class. Here is how this looks (there is also an
example of this in the tests):
    class GetWordCount < MapRedus::Process
      def self.specification
        {
          :inputter => WordStream,
          :mapper => WordCounter,
          :reducer => Adder,
          :finalizer => ToRedisHash,
          :outputter => MapRedus::RedisHasher,
          :ordered => false
        }
      end
    end

    class Job
      mapreduce_process :word_count, GetWordCount, "job:store:result"
    end

The mapreduce_process declaration takes a name, a process class (whose
specification names the inputter, mapper, reducer, finalizer, and
outputter), and a key under which to store the result. The operation
would then be run on a job by calling the following:

    job = Job.new
    job.mapreduce.word_count( data )
The data argument specifies the data on which this operation is to
run. We are currently working on a way to allow the result_store_key
to change depending on class properties. For instance, in the above
example, if the Job class had an id attribute, we may want to store
the final mapreduce result in "job:store:result:#{id}".
### Inputters, Mappers, Reducers, Finalizers

MapRedus needs an input stream, a mapper, a reducer, and a finalizer
to be defined in order to run. The input stream defines how a block of
your data gets divided so that a mapper can work on a small portion to
map. For example:
    class InputStream < MapRedus::InputStream
      def self.scan(data_object)
        # your data object is a reference to a block of text in redis
        text_block = MapRedus.redis.get(data_object)
        text_block.each_line.each_with_index do |line, i|
          yield(i, line)
        end
      end
    end

    class Mapper < MapRedus::Mapper
      def self.map(data_to_map)
        data_to_map.each do |data|
          key = data
          value = 1
          yield( key, value )
        end
      end
    end
In this example, the input stream calls yield to output a MapRedus
file number and the value that is saved to that file (in redis). The
mapper's map function calls yield to emit the key-value pair for
storage in redis. The reducer's reduce function acts similarly; a
sketch of one is given below.
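For instance, the Adder reducer named in the specification above could
look roughly like this (a sketch, not shipped code; the reduce
signature is assumed to mirror the mapper's, and values read back from
redis come out as strings):

    class Adder < MapRedus::Reducer
      def self.reduce(value_list)
        # redis returns the mapped values as strings, so cast before summing
        yield( value_list.map { |v| v.to_i }.inject(0, :+) )
      end
    end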
The finalizer runs whatever needs to be run when a process completes.
An example:

    class Finalizer < MapRedus::Finalizer
      def self.finalize(process)
        process.each_key_reduced_value do |key, value|
          process.outputter.encode(process.keyname, key, value)
        end
        ...
        < set off a new mapredus process to use this stored data >
      end
    end
The process.keyname refers to the final result key that is stored in
redis. The outputter defines how exactly that encoding is done. We
provide an outputter that encodes your data into a redis hash:

    class RedisHasher < MapRedus::Outputter
      def self.encode(result_key, k, v)
        MapRedus::FileSystem.hset(result_key, k, v)
      end

      def self.decode(result_key, k)
        MapRedus::FileSystem.hget(result_key, k)
      end
    end
The default Outputter makes no changes to the original result, and
tries to store it directly in redis as a string.

Running Tests
-------------

Run the tests, which cover the word counter example and some other
behavior (you'll need to have bundler installed):

    rake

Requirements
------------

* Bundler (this will install all the requirements below)
* Redis
* RedisSupport
* Resque
* Resque-scheduler
### Notes

Instead of calling "emit_intermediate"/"emit" in your map/reduce to
produce a key-value pair/value, you call yield, which will call
emit_intermediate/emit for you. This gives flexibility in using
Mapper/Reducer classes, especially in testing, as sketched below.
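Because of the yield convention, a mapper can be driven directly with a
block, outside of any running process. A sketch using the Mapper
example from earlier (which emits each datum with a count of 1):

    counts = Hash.new(0)
    Mapper.map(%w[apple apple banana]) do |word, count|
      counts[word] += count
    end
    counts  # => {"apple" => 2, "banana" => 1}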
TODO
----
not necessarily in the given order

* if a process fails, do what we are supposed to do, i.e. add a
  failure_hook which does something if your process fails

* include functionality for a partitioner, input reader, combiner

* implement registration of the environment in resque so that we can
  run mapreduce commands from the command line, defining any arbitrary
  mapper and reducer

* implement redundant workers (workers doing the same work in case one
  of them fails)

* if a reducer hits a recoverable failure, make sure that an attempt
  to re-enslave the worker is delayed by some fixed interval

* edit emit for when we have multiple workers doing the same reduce
  (redundant workers for fault tolerance might need to change the
  rpush to a lock and setting of just a value); even if other workers
  do work on the same answer, we want to make sure that the final
  reduced result is the same every time

* add fault tolerance, better tracking of which workers fail,
  especially when we have multiple workers doing the same work
  ... currently this is handled by Resque failure auto retry

* if a perform operation fails then we need to have the worker recover

* make use of finish_metrics somewhere so that we can have statistics
  on how long map reduce processes take

* better tracking of work being assigned, so we can know when a process
  is finished or in progress and have a trigger to do things when it
  finishes

  in resque there is functionality for an after hook which performs
  something after your process does its work

  might also check out the resque-status plugin for a cheap and easy
  way to plug status and completion-rate into existing resque jobs

* ensure reducers only do a fixed amount of work? See section 3.2 of
  the MapReduce paper: bookkeeping that tells the master when tasks are
  in-progress or completed. This will be important for better
  parallelization of tasks

* think about the following logic:

  if a reducer starts working on a key after all maps have finished,
  then when it is done the work on that key is finished forever

  this would imply a process finishes when all map tasks have finished
  and all reduce tasks that start after the map tasks have finished

  if a reducer started before all map tasks were finished, then load
  its reduced result back onto the value list

  if the reducer started after all map tasks finished, then emit the
  result
Note on Patches/Pull Requests
-----------------------------

* Fork the project.
* Make your feature addition or bug fix.
* Add tests for it. This is important so I don't break it in a
  future version unintentionally.
* Commit, but do not mess with the rakefile, version, or history. (If
  you want to have your own version, that is fine, but bump the version
  in a commit by itself that I can ignore when I pull.)
* Send me a pull request. Bonus points for topic branches.

## Copyright
Copyright (c) 2010 Dolores Labs. See LICENSE for details.
data/lib/mapredus/filesystem.rb
ADDED
@@ -0,0 +1,43 @@
module MapRedus
  # Manages the bookkeeping of redis keys and redis usage.
  # Provides the data storage for process information through redis.
  # All interaction with redis should go through this class.
  #
  class FileSystem
    def self.storage
      MapRedus.redis
    end

    # Save/Read functions to save/read values for a redis key
    #
    # Examples
    #   FileSystem.save( key, value )
    def self.save(key, value, time = nil)
      storage.set(key, value)
      storage.expire(key, time) if time
    end

    # Proxy any other command (get, hset, rpush, ...) straight to redis,
    # forwarding along any block that was given.
    def self.method_missing(method, *args, &block)
      storage.send(method, *args, &block)
    end

    # Setup locks on results using RedisSupport lock functionality
    #
    # Examples
    #   FileSystem::has_lock?(keyname)
    #   # => true or false
    #
    # Returns true if there's a lock
    def self.has_lock?(keyname)
      MapRedus.has_redis_lock?( RedisKey.result_cache(keyname) )
    end

    def self.acquire_lock(keyname)
      MapRedus.acquire_redis_lock_nonblock( RedisKey.result_cache(keyname), 60 * 60 )
    end

    def self.release_lock(keyname)
      MapRedus.release_redis_lock( RedisKey.result_cache(keyname) )
    end
  end
end
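As a quick usage sketch of the class above (the key name here is
hypothetical), save stores a value with an optional TTL in seconds,
and any other redis command is proxied through method_missing:

    MapRedus::FileSystem.save("mapredus:example", "42", 60 * 60)  # set with a one-hour expiry
    MapRedus::FileSystem.get("mapredus:example")                  # => "42", proxied to redis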
data/lib/mapredus/finalizer.rb
ADDED
@@ -0,0 +1,33 @@
module MapRedus
  # Run the stuff you want to run at the end of the process.
  # Define a subclass which defines self.finalize and self.serialize
  # to do what is needed when you want to get the final output
  # out of redis and into ruby.
  #
  # This is basically the message back to the user program that a
  # process is completed storing the necessary info.
  #
  class Finalizer < QueueProcess

    # The default finalizer is to notify of process completion
    #
    # Example
    #   Finalizer::finalize(pid)
    #   # => "MapRedus Process : 111 : has completed"
    #
    # Returns a message notification
    def self.finalize(pid)
      "MapRedus Process : #{pid} : has completed"
    end

    def self.perform(pid)
      process = Process.open(pid)
      result = finalize(process)
      Master.finish_metrics(pid)
      result
    ensure
      Master.free_slave(pid)
      process.next_state
    end
  end
end
data/lib/mapredus/inputter.rb
ADDED
@@ -0,0 +1,31 @@
module MapRedus
  class InputStream < QueueProcess
    #
    # An InputStream needs to implement a way to scan through the
    # data_object (the object data that is sent to the MapRedus
    # process). The scan function implements how the data object is
    # broken into sizable pieces for the mappers to operate on.
    #
    # It does this by yielding a <key, map_data> pair. The key
    # specifies the storage location in redis. map_data is string
    # data that will be written to redis.
    #
    # Example
    #   scan(data_object) do |key, map_data|
    #     ...
    #   end
    def self.scan(data_object)
      raise InvalidInputStream
    end

    def self.perform(pid, data_object)
      process = Process.open(pid)
      scan(data_object) do |key, map_data|
        FileSystem.hset(ProcessInfo.input(pid), key, map_data)
        Master.enslave_map(process, key)
      end
    ensure
      Master.free_slave(pid)
    end
  end
end
data/lib/mapredus/keys.rb
ADDED
@@ -0,0 +1,86 @@
module MapRedus
  RedisKey = MapRedus::Keys
  ProcessInfo = RedisKey

  #### USED WITHIN process.rb ####

  # Holds the current map reduce processes that are either running or which still have data lying around
  #
  redis_key :processes, "mapredus:processes"
  redis_key :processes_count, "mapredus:processes:count"

  # Holds the information (mapper, reducer, etc.) in json format for a map reduce process with pid PID
  #
  redis_key :pid, "mapredus:process:PID"

  # The input blocks broken down by the InputStream
  redis_key :input, "mapredus:process:PID:input"

  # All the keys that the map produced
  #
  redis_key :keys, "mapredus:process:PID:keys"

  # The hashed key to the actual string value of the key
  #
  redis_key :hash_to_key, "mapredus:process:PID:keys:HASHED_KEY" # to ACTUAL KEY

  # The list of values for a given key generated by our map function.
  # When a reduce is run it takes elements from this key and pushes them to :reduce
  #
  # key - list of values
  #
  redis_key :map, "mapredus:process:PID:map_key:HASHED_KEY"
  redis_key :reduce, "mapredus:process:PID:map_key:HASHED_KEY:reduce"

  # Temporary redis space for reduce functions to use
  #
  redis_key :temp, "mapredus:process:PID:temp_reduce_key:HASHED_KEY:UNIQUE_REDUCE_HOSTNAME:UNIQUE_REDUCE_PROCESS_ID"

  # If we want to hold on to our final data we have a key to put that data in.
  # In normal map reduce we would just be outputting files.
  #
  redis_key :result, "mapredus:process:PID:result"
  redis_key :result_cache, "mapredus:result:KEYNAME"


  #### USED WITHIN master.rb ####

  # Keeps track of the current slaves (by appending "1" to a redis list)
  #
  # TODO: should append some sort of proper process id so we can explicitly keep track
  # of processes
  #
  redis_key :slaves, "mapredus:process:PID:master:slaves"

  #
  # Use these constants to keep track of the progress of a process
  #
  # Example
  #   state => map_in_progress
  #            reduce_in_progress
  #            finalize_in_progress
  #            complete
  #            failed
  #            not_started
  #
  # contained in the ProcessInfo hash (redis_key :state, "mapredus:process:PID:master:state")
  #
  NOT_STARTED = "not_started"
  INPUT_MAP_IN_PROGRESS = "mappers"
  REDUCE_IN_PROGRESS = "reducers"
  FINALIZER_IN_PROGRESS = "finalizer"
  COMPLETE = "complete"
  FAILED = "failed"
  STATE_MACHINE = { nil => NOT_STARTED,
                    NOT_STARTED => INPUT_MAP_IN_PROGRESS,
                    INPUT_MAP_IN_PROGRESS => REDUCE_IN_PROGRESS,
                    REDUCE_IN_PROGRESS => FINALIZER_IN_PROGRESS,
                    FINALIZER_IN_PROGRESS => COMPLETE }

  # These keep track of timing information for a map reduce process of pid PID
  #
  redis_key :requested_at, "mapredus:process:PID:request_at"
  redis_key :started_at, "mapredus:process:PID:started_at"
  redis_key :finished_at, "mapredus:process:PID:finished_at"
  redis_key :recent_time_to_complete, "mapredus:process:recent_time_to_complete"
end
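For orientation (this assumes RedisSupport's redis_key helper
substitutes the uppercase placeholder tokens with the arguments passed
in; the pid and key name below are made up), the generated key helpers
are used like:

    ProcessInfo.pid(77)            # => "mapredus:process:77"
    ProcessInfo.slaves(77)         # => "mapredus:process:77:master:slaves"
    RedisKey.result_cache("word")  # => "mapredus:result:word"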
data/lib/mapredus/mapper.rb
ADDED
@@ -0,0 +1,27 @@
module MapRedus
  # Map is a function that takes a data chunk,
  # where each data chunk is a list of pieces of your raw data,
  # and emits a list of key, value pairs.
  #
  # The output of the map shall always be
  #   [ [key, value], [key, value], ... ]
  #
  # Note: values must be strings, integers, booleans, or floats,
  # i.e., they must be primitive types, since these are the only
  # types that redis supports and since anything inputted into
  # redis becomes a string.
  class Mapper < QueueProcess
    def self.map(data_chunk); raise InvalidMapper; end

    def self.perform(pid, data_key)
      process = Process.open(pid)
      data_chunk = FileSystem.hget(ProcessInfo.input(pid), data_key)
      map( data_chunk ) do |*key_value|
        process.emit_intermediate(*key_value)
      end
    ensure
      Master.free_slave(pid)
      process.next_state
    end
  end
end
data/lib/mapredus/master.rb
ADDED
@@ -0,0 +1,182 @@
module MapRedus
  # Note: Instead of using Resque directly within the process, we implement
  # a master interface with Resque.
  #
  # Does bookkeeping to keep track of how many slaves are doing work. If we have
  # no slaves doing work for a process then the process is done. While there is work available
  # the slaves will always be doing work.
  #
  class Master < QueueProcess
    # Check whether there are still workers working on process PID's processes
    #
    # In the synchronous condition, the master is always working since nothing is going to
    # the queue.
    def self.working?(pid)
      0 < FileSystem.llen(ProcessInfo.slaves(pid))
    end

    #
    # Master performs the work that it needs to do:
    #   it must free itself as a slave from Resque
    #   enslave mappers
    #
    def self.perform( pid, data_object )
      process = Process.open(pid)
      enslave_inputter(process, data_object)
      process.update(:state => INPUT_MAP_IN_PROGRESS)
    end

    #
    # The order of operations that occur in the mapreduce process
    #
    # The inputter sets off the mapper processes
    #
    def self.mapreduce( process, data_object )
      start_metrics(process.pid)
      if process.synchronous
        process.update(:state => INPUT_MAP_IN_PROGRESS)
        enslave_inputter(process, data_object)
        process.update(:state => REDUCE_IN_PROGRESS)
        enslave_reducers(process)
        process.update(:state => FINALIZER_IN_PROGRESS)
        enslave_finalizer(process)
      else
        Resque.push(QueueProcess.queue, {:class => MapRedus::Master, :args => [process.pid, data_object]} )
      end
    end

    def self.enslave_inputter(process, data_object)
      enslave( process, process.inputter, process.pid, data_object )
    end

    def self.enslave_reducers( process )
      process.map_keys.each do |key|
        enslave_reduce( process, key )
      end
    end

    def self.enslave_finalizer( process )
      enslave( process, process.finalizer, process.pid )
    end

    # Have these to match what the Mapper/Reducer perform function expects to see as
    # arguments, though instead of the process the perform function will receive the pid
    def self.enslave_map(process, data_chunk)
      enslave( process, process.mapper, process.pid, data_chunk )
    end

    def self.enslave_reduce(process, key)
      enslave( process, process.reducer, process.pid, key )
    end

    def self.enslave_later_reduce(process, key)
      enslave_later( process.reducer.wait, process, process.reducer, process.pid, key )
    end

    # The current default (QUEUE) that we push on to is
    #   :mapredus
    #
    def self.enslave( process, klass, *args )
      FileSystem.rpush(ProcessInfo.slaves(process.pid), 1)

      if( process.synchronous )
        klass.perform(*args)
      else
        Resque.push( klass.queue, { :class => klass.to_s, :args => args } )
      end
    end

    def self.enslave_later( delay_in_seconds, process, klass, *args )
      FileSystem.rpush(ProcessInfo.slaves(process.pid), 1)

      if( process.synchronous )
        klass.perform(*args)
      else
        #
        # TODO: I cannot get enqueue_in to work with my tests;
        # there seems to be a silent failure somewhere in the tests
        # such that it never calls the function and the queue gets emptied
        #
        # Resque.enqueue_in(delay_in_seconds, klass, *args)

        ##
        ## Temporary: immediately push the process back onto the resque queue
        Resque.push( klass.queue, { :class => klass.to_s, :args => args } )
      end
    end

    def self.slaves(pid)
      FileSystem.lrange(ProcessInfo.slaves(pid), 0, -1)
    end

    def self.free_slave(pid)
      FileSystem.lpop(ProcessInfo.slaves(pid))
    end

    def self.emancipate(pid)
      process = Process.open(pid)
      return unless process

      # Working on resque directly seems dangerous
      #
      # Warning: this is supposed to be used as a debugging operation
      # and isn't intended for normal use. It is potentially very expensive.
      #
      destroyed = 0
      qs = [queue, process.mapper.queue, process.reducer.queue, process.finalizer.queue].uniq
      qs.each do |q|
        q_key = "queue:#{q}"
        Resque.redis.lrange(q_key, 0, -1).each do |string|
          json = Helper.decode(string)
          match = json['class'] == "MapRedus::Master"
          match |= json['class'] == process.inputter.to_s
          match |= json['class'] == process.mapper.to_s
          match |= json['class'] == process.reducer.to_s
          match |= json['class'] == process.finalizer.to_s
          match &= json['args'].first.to_s == process.pid.to_s
          if match
            destroyed += Resque.redis.lrem(q_key, 0, string).to_i
          end
        end
      end

      #
      # our slave information is kept track of on file and not in Resque
      #
      FileSystem.del(ProcessInfo.slaves(pid))
      destroyed
    end

    # Time metrics for measuring how long it takes map reduce to do a process
    #
    def self.set_request_time(pid)
      FileSystem.set( ProcessInfo.requested_at(pid), Time.now.to_i )
    end

    def self.start_metrics(pid)
      started = ProcessInfo.started_at( pid )
      FileSystem.set started, Time.now.to_i
    end

    def self.finish_metrics(pid)
      started = ProcessInfo.started_at( pid )
      finished = ProcessInfo.finished_at( pid )
      requested = ProcessInfo.requested_at( pid )

      completion_time = Time.now.to_i

      FileSystem.set finished, completion_time
      time_to_complete = completion_time - FileSystem.get(started).to_i

      recent_ttcs = ProcessInfo.recent_time_to_complete
      FileSystem.lpush( recent_ttcs, time_to_complete )
      FileSystem.ltrim( recent_ttcs, 0, 30 - 1 )

      FileSystem.expire finished, 60 * 60
      FileSystem.expire started, 60 * 60
      FileSystem.expire requested, 60 * 60
    end
  end
end
data/lib/mapredus/outputter.rb
ADDED
@@ -0,0 +1,42 @@
module MapRedus
  #
  # Standard readers for the input and output of Files coming out
  # of the FileSystem.
  #
  class Outputter < QueueProcess
    def self.decode(result_key)
      FileSystem.get(result_key)
    end

    def self.encode(result_key, o)
      FileSystem.set(result_key, o)
    end

    #
    # type should either be "decode" or "encode"; the remaining
    # arguments are passed through to that method (encode takes a
    # value in addition to the result key, so a fixed arity would break it)
    #
    def self.perform(type, *args)
      send(type, *args)
    end
  end

  class JsonOutputter < Outputter
    def self.decode(result_key)
      Helper.decode(FileSystem.get(result_key))
    end

    def self.encode(result_key, o)
      FileSystem.set(result_key, Helper.encode(o))
    end
  end

  class RedisHasher < Outputter
    def self.encode(result_key, k, v)
      FileSystem.hset(result_key, k, v)
    end

    def self.decode(result_key, k)
      FileSystem.hget(result_key, k)
    end
  end
end