mapredus 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +20 -0
- data/README.md +227 -0
- data/lib/mapredus/filesystem.rb +43 -0
- data/lib/mapredus/finalizer.rb +33 -0
- data/lib/mapredus/inputter.rb +31 -0
- data/lib/mapredus/keys.rb +86 -0
- data/lib/mapredus/mapper.rb +27 -0
- data/lib/mapredus/master.rb +182 -0
- data/lib/mapredus/outputter.rb +42 -0
- data/lib/mapredus/process.rb +366 -0
- data/lib/mapredus/reducer.rb +39 -0
- data/lib/mapredus/support.rb +56 -0
- data/lib/mapredus.rb +106 -0
- data/spec/helper.rb +47 -0
- data/spec/helper_classes.rb +102 -0
- data/spec/mapredus_spec.rb +295 -0
- metadata +144 -0
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
Copyright (c) 2010 Dolores Labs

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,227 @@
MapRedus
=========

Simple MapReduce-type framework using redis and resque.

Overview
--------

This is an experimental implementation of MapReduce using Ruby for
process definition, Resque for work execution, and Redis for data
storage.

Goals:

* simple M/R-style programming for existing Ruby projects
* low cost of entry (no need for a dedicated cluster)

If you are looking for a high-performance MapReduce implementation
that can meet your big data needs, try Hadoop.


Using MapRedus
---------------

MapRedus uses Resque to handle the processes that it runs, and redis
as the store for the values/data produced.

Workers for a MapRedus process are Resque workers. Refer to the
Resque worker documentation to see how to load the necessary
environment for your worker to be able to run mapreduce processes; a
sketch follows, and an example is also located in the tests.
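With Resque's standard rake tasks, the worker environment can be loaded
in a `resque:setup` hook. This is only a sketch: `resque/tasks` and the
`resque:setup` task come from Resque itself, while the required file
name is an assumption about your project layout.

    # Rakefile (illustrative sketch)
    require 'resque/tasks'

    task "resque:setup" do
      require 'mapredus'
      require './app'  # assumed file that defines your MapRedus process classes
    end

The worker can then be started with `QUEUE=mapredus rake resque:work`
(`:mapredus` being the default MapRedus queue).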
### Attaching a mapreduce process to a class

Often you'll want to define a mapreduce process that operates on data
within a class. Here is how that looks (there is also an example of
this in the tests):

    class GetWordCount < MapRedus::Process
      def self.specification
        {
          :inputter => WordStream,
          :mapper => WordCounter,
          :reducer => Adder,
          :finalizer => ToRedisHash,
          :outputter => MapRedus::RedisHasher,
          :ordered => false
        }
      end
    end

    class Job
      mapreduce_process :word_count, GetWordCount, "job:store:result"
    end

mapreduce_process takes a name for the operation, a process class
whose specification names the inputter, mapper, reducer, finalizer,
and outputter, and a key under which to store the result. The
operation is then run on a job by calling the following:

    job = Job.new
    job.mapreduce.word_count( data )

The data argument specifies the data on which this operation is to
run. We are currently working on a way to allow the result_store_key
to change depending on class properties. For instance, in the above
example, if the Job class had an id attribute, we may want to store
the final mapreduce result in "job:store:result:#{id}".
### Inputters, Mappers, Reducers, Finalizers

MapRedus needs an input stream, mapper, reducer, and finalizer to be
defined in order to run. The input stream defines how a block of your
data gets divided so that a mapper can work on a small portion to
map. For example:

    class InputStream < MapRedus::InputStream
      def self.scan(data_object)
        # your data object is a reference to a block of text in redis
        text_block = MapRedus.redis.get(data_object)
        text_block.each_line.each_with_index do |line, i|
          yield(i, line)
        end
      end
    end

    class Mapper < MapRedus::Mapper
      def self.map(data_to_map)
        data_to_map.each do |data|
          key = data
          value = 1
          yield( key, value )
        end
      end
    end

In this example, the input stream calls yield to output a mapredus
file number and the value that is saved to file (in redis). The
mapper's map function calls yield to emit a key-value pair for
storage in redis. The reducer's reduce function acts similarly.
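As a minimal sketch, here is an Adder reducer for the word-count
example; it assumes (per the Notes section below) that reduce receives
the list of values collected for one key and yields the reduced value:

    class Adder < MapRedus::Reducer
      def self.reduce(value_list)
        # sum the mapped counts for this key
        yield( value_list.inject(0) { |sum, v| sum + v.to_i } )
      end
    end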
The finalizer runs whatever needs to be run when a process completes.
An example:

    class Finalizer < MapRedus::Finalizer
      def self.finalize(process)
        process.each_key_reduced_value do |key, value|
          process.outputter.encode(process.keyname, key, value)
        end
        ...
        < set off a new mapredus process to use this stored data >
      end
    end

process.keyname refers to the final result key that is stored in
redis. The outputter defines how exactly that encoding is done. We
provide an outputter that encodes your data into a redis hash:

    class RedisHasher < MapRedus::Outputter
      def self.encode(result_key, k, v)
        MapRedus::FileSystem.hset(result_key, k, v)
      end

      def self.decode(result_key, k)
        MapRedus::FileSystem.hget(result_key, k)
      end
    end

The default Outputter makes no changes to the original result, and
tries to store it directly in redis as a string.
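As an illustration, after the word-count process above completes, a
single count can be read back through the outputter's decode (the
result key is the one given to mapreduce_process earlier; the word is
hypothetical):

    MapRedus::RedisHasher.decode("job:store:result", "apple")
    # => the stored count for "apple" (a string, since redis stores strings)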
Running Tests
-------------

Run the tests, which exercise the word-counter example and some other
cases (you'll need to have bundler installed):

    rake

Requirements
------------

* Bundler (this will install all the requirements below)
* Redis
* RedisSupport
* Resque
* Resque-scheduler

### Notes
Instead of calling "emit_intermediate"/"emit" in your map/reduce to
produce a key-value pair/value, you call yield, which will call
emit_intermediate/emit for you. This gives flexibility in using
Mapper/Reducer classes, especially in testing.

TODO
----
not necessarily in the given order

* if a process fails, do what we are supposed to do, i.e. add a
  failure_hook which does something if your process fails

* include functionality for a partitioner, input reader, combiner

* implement registering of the environment in resque so that we can
  run mapreduce commands from the command line, defining any
  arbitrary mapper and reducer

* implement redundant workers (workers doing the same work in case
  one of them fails)

* if a reducer hits a recoverable failure, make sure that the attempt
  to re-enslave the worker is delayed by some fixed interval

* edit emit for when we have multiple workers doing the same reduce
  (redundant workers for fault tolerance might need to change the
  rpush to a lock and the setting of just a value); even if other
  workers do work on the same answer, we want to make sure that the
  final reduced thing is the same every time

* add fault tolerance, better tracking of which workers fail,
  especially when we have multiple workers doing the same work
  ... currently this is handled by Resque failure auto retry

* if a perform operation fails then we need to have the worker recover

* make use of finish_metrics somewhere so that we can have statistics
  on how long map reduce processes take

* better tracking of work being assigned, so we can know when a
  process is finished or in progress and have a trigger to do things
  when it finishes

  in resque there is functionality for an after hook which performs
  something after your process does its work

  might also check out the resque-status plugin for a cheap and easy
  way to plug status and completion-rate into existing resque jobs

* ensure reducers only do a fixed amount of work? See section 3.2 of
  the MapReduce paper: bookkeeping that tells the master when tasks
  are in progress or completed. This will be important for better
  parallelization of tasks

* think about the following logic:

  if a reducer starts working on a key after all maps have finished,
  then when it is done the work on that key is finished forever

  this would imply a process finishes when all map tasks have
  finished and all reduce tasks that started after the map tasks
  have finished

  if a reducer started before all map tasks were finished, then load
  its reduced result back onto the value list

  if the reducer started after all map tasks finished, then emit the
  result

Note on Patches/Pull Requests
-----------------------------

* Fork the project.
* Make your feature addition or bug fix.
* Add tests for it. This is important so I don't break it in a
  future version unintentionally.
* Commit, but do not mess with the rakefile, version, or history (if
  you want to have your own version, that is fine, but bump the
  version in a commit by itself that I can ignore when I pull).
* Send me a pull request. Bonus points for topic branches.

## Copyright
Copyright (c) 2010 Dolores Labs. See LICENSE for details.
data/lib/mapredus/filesystem.rb
ADDED
@@ -0,0 +1,43 @@
module MapRedus
  # Manages the bookkeeping of redis keys and redis usage.
  # Provides the data storage for process information through redis.
  # All interaction with redis should go through this class.
  #
  class FileSystem
    def self.storage
      MapRedus.redis
    end

    # Save/Read functions to save/read values for a redis key
    #
    # Examples
    #   FileSystem.save( key, value )
    def self.save(key, value, time = nil)
      storage.set(key, value)
      storage.expire(key, time) if time
    end

    def self.method_missing(method, *args, &block)
      storage.send(method, *args)
    end

    # Setup locks on results using RedisSupport lock functionality
    #
    # Examples
    #   FileSystem::has_lock?(keyname)
    #   # => true or false
    #
    # Returns true if there's a lock
    def self.has_lock?(keyname)
      MapRedus.has_redis_lock?( RedisKey.result_cache(keyname) )
    end

    def self.acquire_lock(keyname)
      MapRedus.acquire_redis_lock_nonblock( RedisKey.result_cache(keyname), 60 * 60 )
    end

    def self.release_lock(keyname)
      MapRedus.release_redis_lock( RedisKey.result_cache(keyname) )
    end
  end
end
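A brief illustration of FileSystem as defined above (the key and value
here are hypothetical): save writes a value with an optional expiry,
and any other redis command is forwarded to the underlying connection
through method_missing:

    MapRedus::FileSystem.save("mapredus:example", "42", 60)  # SET followed by EXPIRE 60s
    MapRedus::FileSystem.get("mapredus:example")             # forwarded to redis, => "42"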
data/lib/mapredus/finalizer.rb
ADDED
@@ -0,0 +1,33 @@
module MapRedus
  # Run the stuff you want to run at the end of the process.
  # Define a subclass which defines self.finalize and self.serialize
  # to do what is needed when you want to get the final output
  # out of redis and into ruby.
  #
  # This is basically the message back to the user program that a
  # process is completed storing the necessary info.
  #
  class Finalizer < QueueProcess

    # The default finalizer is to notify of process completion
    #
    # Example
    #   Finalizer::finalize(pid)
    #   # => "MapRedus Process : 111 : has completed"
    #
    # Returns a message notification
    def self.finalize(pid)
      "MapRedus Process : #{pid} : has completed"
    end

    def self.perform(pid)
      process = Process.open(pid)
      result = finalize(process)
      Master.finish_metrics(pid)
      result
    ensure
      Master.free_slave(pid)
      process.next_state
    end
  end
end
data/lib/mapredus/inputter.rb
ADDED
@@ -0,0 +1,31 @@
module MapRedus
  class InputStream < QueueProcess
    #
    # An InputStream needs to implement a way to scan through the
    # data_object (the object data that is sent to the MapRedus
    # process). The scan function implements how the data object is
    # broken into sizable pieces for the mappers to operate on.
    #
    # It does this by yielding a <key, map_data> pair. The key
    # specifies the storage location in redis. map_data is string
    # data that will be written to redis.
    #
    # Example
    #   scan(data_object) do |key, map_data|
    #     ...
    #   end
    def self.scan(data_object)
      raise InvalidInputStream
    end

    def self.perform(pid, data_object)
      process = Process.open(pid)
      scan(data_object) do |key, map_data|
        FileSystem.hset(ProcessInfo.input(pid), key, map_data)
        Master.enslave_map(process, key)
      end
    ensure
      Master.free_slave(pid)
    end
  end
end
data/lib/mapredus/keys.rb
ADDED
@@ -0,0 +1,86 @@
module MapRedus
  RedisKey = MapRedus::Keys
  ProcessInfo = RedisKey

  #### USED WITHIN process.rb ####

  # Holds the current map reduce processes that are either running or which still have data lying around
  #
  redis_key :processes, "mapredus:processes"
  redis_key :processes_count, "mapredus:processes:count"

  # Holds the information (mapper, reducer, etc.) in json format for a map reduce process with pid PID
  #
  redis_key :pid, "mapredus:process:PID"

  # The input blocks broken down by the InputStream
  redis_key :input, "mapredus:process:PID:input"

  # All the keys that the map produced
  #
  redis_key :keys, "mapredus:process:PID:keys"

  # The hashed key to actual string value of key
  #
  redis_key :hash_to_key, "mapredus:process:PID:keys:HASHED_KEY" # to ACTUAL KEY

  # The list of values for a given key generated by our map function.
  # When a reduce is run it takes elements from this key and pushes them to :reduce
  #
  # key - list of values
  #
  redis_key :map, "mapredus:process:PID:map_key:HASHED_KEY"
  redis_key :reduce, "mapredus:process:PID:map_key:HASHED_KEY:reduce"

  # Temporary redis space for reduce functions to use
  #
  redis_key :temp, "mapredus:process:PID:temp_reduce_key:HASHED_KEY:UNIQUE_REDUCE_HOSTNAME:UNIQUE_REDUCE_PROCESS_ID"

  # If we want to hold on to our final data we have a key to put that data in
  # In normal map reduce we would just be outputting files
  #
  redis_key :result, "mapredus:process:PID:result"
  redis_key :result_cache, "mapredus:result:KEYNAME"


  #### USED WITHIN master.rb ####

  # Keeps track of the current slaves (by appending "1" to a redis list)
  #
  # TODO: should append some sort of proper process id so we can explicitly keep track
  # of processes
  #
  redis_key :slaves, "mapredus:process:PID:master:slaves"

  #
  # Use these constants to keep track of the progress of a process
  #
  # Example
  #   state => map_in_progress
  #            reduce_in_progress
  #            finalize_in_progress
  #            complete
  #            failed
  #            not_started
  #
  # contained in the ProcessInfo hash (redis_key :state, "mapredus:process:PID:master:state")
  #
  NOT_STARTED = "not_started"
  INPUT_MAP_IN_PROGRESS = "mappers"
  REDUCE_IN_PROGRESS = "reducers"
  FINALIZER_IN_PROGRESS = "finalizer"
  COMPLETE = "complete"
  FAILED = "failed"
  STATE_MACHINE = { nil => NOT_STARTED,
                    NOT_STARTED => INPUT_MAP_IN_PROGRESS,
                    INPUT_MAP_IN_PROGRESS => REDUCE_IN_PROGRESS,
                    REDUCE_IN_PROGRESS => FINALIZER_IN_PROGRESS,
                    FINALIZER_IN_PROGRESS => COMPLETE }

  # These keep track of timing information for a map reduce process of pid PID
  #
  redis_key :requested_at, "mapredus:process:PID:request_at"
  redis_key :started_at, "mapredus:process:PID:started_at"
  redis_key :finished_at, "mapredus:process:PID:finished_at"
  redis_key :recent_time_to_complete, "mapredus:process:recent_time_to_complete"
end
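For illustration, redis_key comes from RedisSupport and, judging from
how the generated helpers are called elsewhere in this gem, substitutes
the uppercase placeholders with the arguments passed in (the pid below
is hypothetical):

    MapRedus::ProcessInfo.pid(7)    # => "mapredus:process:7" (assumed substitution)
    MapRedus::ProcessInfo.input(7)  # => "mapredus:process:7:input"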
data/lib/mapredus/mapper.rb
ADDED
@@ -0,0 +1,27 @@
module MapRedus
  # Map is a function that takes a data chunk
  # where each data chunk is a list of pieces of your raw data
  # and emits a list of key, value pairs.
  #
  # The output of the map shall always be
  #   [ [key, value], [key, value], ... ]
  #
  # Note: Values must be strings, integers, booleans, or floats.
  # i.e., They must be primitive types since these are the only
  # types that redis supports and since anything inputted into
  # redis becomes a string.
  class Mapper < QueueProcess
    def self.map(data_chunk); raise InvalidMapper; end

    def self.perform(pid, data_key)
      process = Process.open(pid)
      data_chunk = FileSystem.hget(ProcessInfo.input(pid), data_key)
      map( data_chunk ) do |*key_value|
        process.emit_intermediate(*key_value)
      end
    ensure
      Master.free_slave(pid)
      process.next_state
    end
  end
end
data/lib/mapredus/master.rb
ADDED
@@ -0,0 +1,182 @@
module MapRedus
  # Note: Instead of using Resque directly within the process, we implement
  # a master interface with Resque
  #
  # Does bookkeeping to keep track of how many slaves are doing work. If we have
  # no slaves doing work for a process then the process is done. While there is work available
  # the slaves will always be doing work.
  #
  class Master < QueueProcess
    # Check whether there are still workers working on process PID's processes
    #
    # In the synchronous condition, master is always working since nothing is going to
    # the queue.
    def self.working?(pid)
      0 < FileSystem.llen(ProcessInfo.slaves(pid))
    end

    #
    # Master performs the work that it needs to do:
    #   it must free itself as a slave from Resque
    #   enslave mappers
    #
    def self.perform( pid, data_object )
      process = Process.open(pid)
      enslave_inputter(process, data_object)
      process.update(:state => INPUT_MAP_IN_PROGRESS)
    end

    #
    # The order of operations that occur in the mapreduce process
    #
    # The inputter sets off the mapper processes
    #
    def self.mapreduce( process, data_object )
      start_metrics(process.pid)
      if process.synchronous
        process.update(:state => INPUT_MAP_IN_PROGRESS)
        enslave_inputter(process, data_object)
        process.update(:state => REDUCE_IN_PROGRESS)
        enslave_reducers(process)
        process.update(:state => FINALIZER_IN_PROGRESS)
        enslave_finalizer(process)
      else
        Resque.push(QueueProcess.queue, {:class => MapRedus::Master, :args => [process.pid, data_object]} )
      end
    end

    def self.enslave_inputter(process, data_object)
      enslave( process, process.inputter, process.pid, data_object )
    end

    def self.enslave_reducers( process )
      process.map_keys.each do |key|
        enslave_reduce( process, key )
      end
    end

    def self.enslave_finalizer( process )
      enslave( process, process.finalizer, process.pid )
    end

    # Have these to match what the Mapper/Reducer perform function expects to see as arguments
    #
    # though instead of process the perform function will receive the pid
    def self.enslave_map(process, data_chunk)
      enslave( process, process.mapper, process.pid, data_chunk )
    end

    def self.enslave_reduce(process, key)
      enslave( process, process.reducer, process.pid, key )
    end

    def self.enslave_later_reduce(process, key)
      enslave_later( process.reducer.wait, process, process.reducer, process.pid, key )
    end

    # The current default (QUEUE) that we push on to is
    #   :mapredus
    #
    def self.enslave( process, klass, *args )
      FileSystem.rpush(ProcessInfo.slaves(process.pid), 1)

      if( process.synchronous )
        klass.perform(*args)
      else
        Resque.push( klass.queue, { :class => klass.to_s, :args => args } )
      end
    end

    def self.enslave_later( delay_in_seconds, process, klass, *args)
      FileSystem.rpush(ProcessInfo.slaves(process.pid), 1)

      if( process.synchronous )
        klass.perform(*args)
      else
        #
        # TODO: I cannot get enqueue_in to work with my tests
        # there seems to be a silent failure somewhere
        # in the tests such that it never calls the function
        # and the queue gets emptied
        #
        # Resque.enqueue_in(delay_in_seconds, klass, *args)

        ##
        ## Temporary, immediately just push process back onto the resque queue
        Resque.push( klass.queue, { :class => klass.to_s, :args => args } )
      end
    end

    def self.slaves(pid)
      FileSystem.lrange(ProcessInfo.slaves(pid), 0, -1)
    end

    def self.free_slave(pid)
      FileSystem.lpop(ProcessInfo.slaves(pid))
    end

    def self.emancipate(pid)
      process = Process.open(pid)
      return unless process

      # Working on resque directly seems dangerous
      #
      # Warning: this is supposed to be used as a debugging operation
      # and isn't intended for normal use. It is potentially very expensive.
      #
      destroyed = 0
      qs = [queue, process.mapper.queue, process.reducer.queue, process.finalizer.queue].uniq
      qs.each do |q|
        q_key = "queue:#{q}"
        Resque.redis.lrange(q_key, 0, -1).each do |string|
          json = Helper.decode(string)
          match = json['class'] == "MapRedus::Master"
          match |= json['class'] == process.inputter.to_s
          match |= json['class'] == process.mapper.to_s
          match |= json['class'] == process.reducer.to_s
          match |= json['class'] == process.finalizer.to_s
          match &= json['args'].first.to_s == process.pid.to_s
          if match
            destroyed += Resque.redis.lrem(q_key, 0, string).to_i
          end
        end
      end

      #
      # our slave information is kept track of on file and not in Resque
      #
      FileSystem.del(ProcessInfo.slaves(pid))
      destroyed
    end

    # Time metrics for measuring how long it takes map reduce to do a process
    #
    def self.set_request_time(pid)
      FileSystem.set( ProcessInfo.requested_at(pid), Time.now.to_i )
    end

    def self.start_metrics(pid)
      started = ProcessInfo.started_at( pid )
      FileSystem.set started, Time.now.to_i
    end

    def self.finish_metrics(pid)
      started = ProcessInfo.started_at( pid )
      finished = ProcessInfo.finished_at( pid )
      requested = ProcessInfo.requested_at( pid )

      completion_time = Time.now.to_i

      FileSystem.set finished, completion_time
      time_to_complete = completion_time - FileSystem.get(started).to_i

      recent_ttcs = ProcessInfo.recent_time_to_complete
      FileSystem.lpush( recent_ttcs, time_to_complete )
      FileSystem.ltrim( recent_ttcs, 0, 30 - 1 )

      FileSystem.expire finished, 60 * 60
      FileSystem.expire started, 60 * 60
      FileSystem.expire requested, 60 * 60
    end
  end
end
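As the warning inside emancipate says, it is a debugging operation; a
hypothetical session for clearing out a stuck process might look like:

    pid = "111"                        # hypothetical process id
    MapRedus::Master.working?(pid)     # => true while slave entries remain
    MapRedus::Master.emancipate(pid)   # => number of queued jobs removed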
data/lib/mapredus/outputter.rb
ADDED
@@ -0,0 +1,42 @@
module MapRedus
  #
  # Standard readers for the input and output of Files coming out
  # of the FileSystem.
  #
  class Outputter < QueueProcess
    def self.decode(result_key)
      FileSystem.get(result_key)
    end

    def self.encode(result_key, o)
      FileSystem.set(result_key, o)
    end

    #
    # type should either be "decode" or "encode"
    #
    def self.perform(type, o)
      send(type, o)
    end
  end

  class JsonOutputter < Outputter
    def self.decode(result_key)
      Helper.decode(FileSystem.get(result_key))
    end

    def self.encode(result_key, o)
      FileSystem.set(result_key, Helper.encode(o))
    end
  end

  class RedisHasher < Outputter
    def self.encode(result_key, k, v)
      FileSystem.hset(result_key, k, v)
    end

    def self.decode(result_key, k)
      FileSystem.hget(result_key, k)
    end
  end
end
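For illustration, a round trip through JsonOutputter, assuming
Helper.encode/Helper.decode wrap JSON serialization (their use on
Resque payloads in master.rb suggests this, but Helper is not shown in
this diff); the key is hypothetical:

    MapRedus::JsonOutputter.encode("mapredus:example:result", { "apple" => 3 })
    MapRedus::JsonOutputter.decode("mapredus:example:result")  # => {"apple"=>3}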