scbi_mapreduce 0.0.29

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. data/History.txt +49 -0
  2. data/Manifest.txt +46 -0
  3. data/PostInstall.txt +7 -0
  4. data/README.rdoc +295 -0
  5. data/Rakefile +28 -0
  6. data/bin/scbi_mapreduce +52 -0
  7. data/lib/scbi_mapreduce.rb +15 -0
  8. data/lib/scbi_mapreduce/error_handler.rb +15 -0
  9. data/lib/scbi_mapreduce/main_worker.rb +50 -0
  10. data/lib/scbi_mapreduce/manager.rb +110 -0
  11. data/lib/scbi_mapreduce/work_manager.rb +405 -0
  12. data/lib/scbi_mapreduce/worker.rb +163 -0
  13. data/lib/scbi_mapreduce/worker_launcher.rb +96 -0
  14. data/lib/scbi_mapreduce/zlib_serializer.rb +32 -0
  15. data/script/console +10 -0
  16. data/script/destroy +14 -0
  17. data/script/generate +14 -0
  18. data/skeleton/dummy_calcs/README.txt +25 -0
  19. data/skeleton/dummy_calcs/lib/calculations.rb +37 -0
  20. data/skeleton/dummy_calcs/lib/thread_pool.rb +107 -0
  21. data/skeleton/dummy_calcs/linear_implementation.rb +22 -0
  22. data/skeleton/dummy_calcs/main.rb +67 -0
  23. data/skeleton/dummy_calcs/my_worker.rb +56 -0
  24. data/skeleton/dummy_calcs/my_worker_manager.rb +52 -0
  25. data/skeleton/dummy_calcs/threads_implementation.rb +33 -0
  26. data/skeleton/remove_mids/README.txt +30 -0
  27. data/skeleton/remove_mids/launch_only_workers.rb +29 -0
  28. data/skeleton/remove_mids/lib/db/mids.fasta +120 -0
  29. data/skeleton/remove_mids/lib/find_mids.rb +191 -0
  30. data/skeleton/remove_mids/lib/global_match.rb +97 -0
  31. data/skeleton/remove_mids/linear_implementation.rb +87 -0
  32. data/skeleton/remove_mids/main.rb +89 -0
  33. data/skeleton/remove_mids/my_worker.rb +59 -0
  34. data/skeleton/remove_mids/my_worker_manager.rb +68 -0
  35. data/skeleton/simple/README.txt +16 -0
  36. data/skeleton/simple/main.rb +41 -0
  37. data/skeleton/simple/my_worker.rb +53 -0
  38. data/skeleton/simple/my_worker_manager.rb +55 -0
  39. data/test/drb_test/main.rb +31 -0
  40. data/test/drb_test/my_worker.rb +36 -0
  41. data/test/drb_test/my_worker_manager.rb +41 -0
  42. data/test/drb_test/scbi_drb_checkpoint +1 -0
  43. data/test/drb_test/scbi_mapreduce_checkpoint +1 -0
  44. data/test/test_helper.rb +3 -0
  45. data/test/test_scbi_drb.rb +11 -0
  46. metadata +127 -0
data/History.txt ADDED
@@ -0,0 +1,49 @@
1
+ === 0.0.29 2011-06-13
2
+
3
+ First rubygems release
4
+
5
+ === 0.0.21 2011-05-19
6
+
7
+ added zlib serialization
8
+
9
+ === 0.0.20 2011-05-18
10
+
11
+ Own serializer
12
+
13
+ === 0.0.19 2011-05-11
14
+
15
+ Added dummy_calculations skeleton
16
+
17
+ === 0.0.18 2011-05-11
18
+
19
+ Added fibo skeleton
20
+
21
+ === 0.0.17 2011-05-09
22
+
23
+ New sequence Skeleton
24
+
25
+ === 0.0.16 2011-05-05
26
+
27
+ Automatically create log directory
28
+
29
+ === 0.0.4 2010-08-26
30
+
31
+ * 1 minor fix:
32
+ * add initial config interchange between server and workers
33
+
34
+
35
+ === 0.0.3 2010-08-24
36
+
37
+ * 1 minor fix:
38
+ * changed start_worker from post_init
39
+
40
+
41
+ === 0.0.2 2010-08-06
42
+
43
+ * 1 minor fix:
44
+ * changed logs names
45
+
46
+ === 0.0.1 2010-06-11
47
+
48
+ * 1 major enhancement:
49
+ * Initial release
data/Manifest.txt ADDED
@@ -0,0 +1,46 @@
1
+ History.txt
2
+ lib/scbi_mapreduce/error_handler.rb
3
+ lib/scbi_mapreduce/main_worker.rb
4
+ lib/scbi_mapreduce/manager.rb
5
+ lib/scbi_mapreduce/work_manager.rb
6
+ lib/scbi_mapreduce/worker.rb
7
+ lib/scbi_mapreduce/zlib_serializer.rb
8
+ lib/scbi_mapreduce/worker_launcher.rb
9
+ lib/scbi_mapreduce.rb
10
+ Manifest.txt
11
+ PostInstall.txt
12
+ Rakefile
13
+ README.rdoc
14
+ script/console
15
+ script/destroy
16
+ script/generate
17
+ test/drb_test/logs
18
+ test/drb_test/main.rb
19
+ test/drb_test/my_worker.rb
20
+ test/drb_test/my_worker_manager.rb
21
+ test/drb_test/scbi_drb_checkpoint
22
+ test/drb_test/scbi_mapreduce_checkpoint
23
+ test/test_helper.rb
24
+ test/test_scbi_drb.rb
25
+ bin/scbi_mapreduce
26
+ skeleton/simple/main.rb
27
+ skeleton/simple/my_worker.rb
28
+ skeleton/simple/my_worker_manager.rb
29
+ skeleton/simple/README.txt
30
+ skeleton/remove_mids/launch_only_workers.rb
31
+ skeleton/remove_mids/lib/db/mids.fasta
32
+ skeleton/remove_mids/lib/find_mids.rb
33
+ skeleton/remove_mids/lib/global_match.rb
34
+ skeleton/remove_mids/linear_implementation.rb
35
+ skeleton/remove_mids/main.rb
36
+ skeleton/remove_mids/my_worker.rb
37
+ skeleton/remove_mids/my_worker_manager.rb
38
+ skeleton/remove_mids/README.txt
39
+ skeleton/dummy_calcs/lib/calculations.rb
40
+ skeleton/dummy_calcs/lib/thread_pool.rb
41
+ skeleton/dummy_calcs/linear_implementation.rb
42
+ skeleton/dummy_calcs/main.rb
43
+ skeleton/dummy_calcs/my_worker.rb
44
+ skeleton/dummy_calcs/my_worker_manager.rb
45
+ skeleton/dummy_calcs/README.txt
46
+ skeleton/dummy_calcs/threads_implementation.rb
data/PostInstall.txt ADDED
@@ -0,0 +1,7 @@
1
+
2
+ For more information on scbi_mapreduce, see http://scbi_mapreduce.rubyforge.org
3
+
4
+ NOTE: Change this information in PostInstall.txt
5
+ You can also delete it if you don't want it.
6
+
7
+
data/README.rdoc ADDED
@@ -0,0 +1,295 @@
1
+ = scbi_mapreduce
2
+
3
+ * http://www.scbi.uma.es/downloads
4
+
5
+ == DESCRIPTION:
6
+
7
+ scbi_mapreduce brings parallel and distributed computing capabilities to your code, with a very easy to use framework that allows you to exploit your clustered or cloud computational resources.
8
+
9
+ == FEATURES:
10
+
11
+ scbi_mapreduce provides a black boxed distributed programming. Users only need to code some predefined methods in order to achieve distribution. Programming remains sequential at user level (this avoids the hassle of threads or processes handling).
12
+
13
+ When a project using scbi_mapreduce is run, a Manager process and a bunch of workers are created (workers can be on different machines). Manager will dispatch new data to available workers (mapping phase), each worker receives its data, manipulates it and returns the data again to Manager that will aggregate it as desired (reduction phase).
14
+
15
+ The manager is always waiting for workers connections or requests. When a new worker connects, it automatically receives some initial params from the server. After the initial configuration, each worker receives a first chunk of work data. Once a worker has done its job with the received data, it sends the results back to the manager, the manager saves the data, and sends a new assignment to the worker. This process is repeated until manager doesn’t have more data to be processed.
16
+
17
+ === Some cool features of scbi_mapreduce are:
18
+
19
+ - Automatic project creation using a generator and templates (you only need to modify some methods since a scaffold is automatically created for you)
20
+ - Variable data-chunksizes: data can be grouped on variable size chunks in order to optimize network transfers and processing
21
+ - Fixed order: order of input data can be maintained after the parallel execution (uses a cache to store out of order data until it is needed)
22
+ - Checkpoint: current processing status can be committed to disk allowing to retake the execution of an interrupted job at the last committed point
23
+ - Compression: data transfers can be automatically compressed
24
+ - Encryption: data transfers can be automatically encrypted
25
+
26
+ === Worker-specific features:
27
+
28
+ - Workers are automatically spawned over the cluster (be sure to configure automatic login via ssh with ssh keys)
29
+ - Additional workers can be launched/stopped at any time
30
+ - Workers can be executed over a mixture of architectures and operating systems simultaneously (x86 64, ia64, i686 - OSX, Linux, UNIX)
31
+ - Workers of different speeds work at full capacity all the time, without producing delays on faster workers
32
+ - scbi_mapreduce uses tcp/ip and because of that it can be used over a wide variety of interconnection networks (ethernet, Gigabit, InfiniBand, Myrinet, optic-fiber with ip, etc...), and of course, over the internet (although performance will be restricted by network latency and speed)
33
+ - High work throughput. About 18000 works (1 kb of data) per second with a single core manager
34
+ - Number of workers is highly scalable. Tested with up to 80 distributed cores.
35
+ - Same solution works on standalone machines, clusters, cloud, SMP machines, or a mixture of them
36
+
37
+ === Other features
38
+
39
+ - Exhaustive log option: manager and per-worker logs are very useful at development stages
40
+ - Processing stats: scbi_mapreduce calculates individual performance statistics for each worker and a global one for manager process.
41
+ - scbi_mapreduce makes use of evented IO (EventMachine) being efficient regarding to networked I/O operations
42
+ - Reduced disk I/O: data is read only once, subsequent transfers and splitting are done in RAM (this is very appropriate when disk I/O is already quoted in Cloud or pay per use services)
43
+ - There is no need to use shared storage, (although the software must be installed on all worker machines)
44
+ - Worker error handling: when an exception raises in a worker, it is reported to manager, where it can be handled appropriately
45
+ - High error rate aborting: if a high error rate is detected, execution is aborted in order to preserve computational resources, so the user doesn't need to process the whole dataset to find that there was a programming mistake (very useful with pay per use services)
46
+
47
+
48
+ scbi_mapreduce has been tested on production with PBS and Moab/Slurm queue systems, but it can be easily adapted to other ones.
49
+
50
+ == SYNOPSIS:
51
+
52
+ scbi_mapreduce provides an automated code generator like rails. To use it, you only need to issue this command:
53
+
54
+ scbi_mapreduce app_name template
55
+
56
+ E.g.: To create a simple app demo (other templates are available, to list them execute scbi_mapreduce without arguments):
57
+
58
+ scbi_mapreduce my_app simple
59
+
60
+ A full project template will be created for you with (at least) the following files:
61
+
62
+ my_app/main.rb
63
+ my_app/my_worker.rb
64
+ my_app/my_worker_manager.rb
65
+ my_app/README.txt
66
+
67
+
68
+ You can run main.rb as any other ruby script.
69
+
70
+ cd my_app
71
+ ruby main.rb
72
+
73
+ Now that everything is working, you must modify +my_worker+ and +my_worker_manager+ in order to do the desired work.
74
+
75
+ === my_worker_manager.rb
76
+
77
+ In my_worker_manager you open input files, split data into chunks that are automatically sent to workers, and later write the results to disk when workers finish them. Here are the basic methods that can be customized.
78
+
79
+ The most important ones are +next_work+ (where data is split into chunks), and +work_received+ (where processed data is received from workers):
80
+
81
+ # next_work method is called every time a worker needs a new work
82
+ # Here you can read data from disk
83
+ # This method must return the work data or nil if no more data is available
84
+ def next_work
85
+ @@remaining_data -= 1
86
+
87
+ e = @@basic_string
88
+
89
+ e = nil if @@remaining_data<0
90
+ return e
91
+
92
+ end
93
+
94
+ -
95
+
96
+ # work_received is executed each time a worker has finished a job.
97
+ # Here you can write results down to disk, perform some aggregated statistics, etc...
98
+ def work_received(results)
99
+
100
+ # write_data_to_disk(results)
101
+ end
102
+
103
+
104
+ There are also some other methods that can be used to send initial configuration parameters, open and close files, etc...
105
+
106
+ # init_work_manager is executed at the start, prior to any processing.
107
+ # You can use init_work_manager to initialize global variables, open files, etc...
108
+ # Note that an instance of MyWorkerManager will be created for each
109
+ # worker connection, and thus, all global variables here should be
110
+ # class variables (starting with @@)
111
+ def self.init_work_manager
112
+
113
+ # use 200000 strings
114
+ @@remaining_data = 200000
115
+
116
+ # of 1024 characters each
117
+ @@basic_string='a'*1024
118
+
119
+ end
120
+
121
+ -
122
+
123
+ # end_work_manager is executed at the end, when all the process is done.
124
+ # You can use it to close files opened in init_work_manager
125
+ def self.end_work_manager
126
+
127
+ end
128
+
129
+ -
130
+
131
+ # worker_initial_config is used to send initial parameters to workers.
132
+ # The method is executed once per each worker
133
+ def worker_initial_config
134
+
135
+ end
136
+
137
+
138
+ === my_worker.rb
139
+
140
+ The main method that needs to be modified on my_worker.rb is +process_object+. It is executed each time new data is available, and is where the real distributed processing takes place since it is executed simultaneously on different machines.
141
+
142
+ def process_object(objs)
143
+
144
+ # iterate over all objects received
145
+ objs.each do |obj|
146
+ # convert to uppercase
147
+ obj.upcase!
148
+ end
149
+
150
+ # return objs back to manager
151
+ return objs
152
+ end
153
+
154
+
155
+ There are other useful methods:
156
+
157
+ # starting_worker method is called one time at initialization
158
+ # and allows you to initialize your variables
159
+ def starting_worker
160
+
161
+ # You can use worker logs at any time in this way:
162
+ # $WORKER_LOG.info "Starting a worker"
163
+
164
+ end
165
+
166
+ -
167
+
168
+ # receive_initial_config is called only once just after
169
+ # the first connection, when initial parameters are
170
+ # received from manager
171
+ def receive_initial_config(parameters)
172
+
173
+ # Reads the parameters
174
+
175
+ # You can use worker logs at any time in this way:
176
+ # $WORKER_LOG.info "Params received"
177
+
178
+ # save received parameters, if any
179
+ # @params = parameters
180
+ end
181
+
182
+ -
183
+
184
+ # process_object method is called for each received object.
185
+ # Be aware that objs is always an array, and you must iterate
186
+ # over it if you need to process it independently
187
+ #
188
+ # The value returned here will be received by the work_received
189
+ # method at your worker_manager subclass.
190
+ def process_object(objs)
191
+
192
+ # iterate over all objects received
193
+ objs.each do |obj|
194
+
195
+ # convert to uppercase
196
+ obj.upcase!
197
+ end
198
+
199
+ # return objs back to manager
200
+ return objs
201
+ end
202
+
203
+ -
204
+
205
+ # called once, when the worker is about to be closed
206
+ def closing_worker
207
+
208
+ end
209
+
210
+ === main.rb
211
+
212
+
213
+ On main.rb is where the manager and workers are launched. Here you define listening ip.
214
+
215
+ # listen on all ips at port 50000
216
+ ip='0.0.0.0'
217
+ port = 50000
218
+
219
+ If you are using a cluster and thus don't know where manager will be executed, you can specify the initial part of the ip interface. Eg.: if you specify ip='10.16', scbi_mapreduce will use the network interface that matches this ip:
220
+
221
+ The number of workers can be a number (workers are launched on the same machine as the Manager), or a list of machine names, in which case workers are launched via ssh on remote machines and automatically connected to Manager.
222
+
223
+ # set number of workers. You can also provide an array with worker names.
224
+ # Those workers names can be read from a file produced by the existing
225
+ # queue system, if any.
226
+ workers = 8
227
+
228
+ Your worker file will be used to launch workers.
229
+
230
+ # we need the path to my_worker in order to launch it when necessary
231
+ custom_worker_file = File.join(File.dirname(__FILE__),'my_worker.rb')
232
+
233
+ # initialize the work manager. Here you can pass parameters like file names
234
+ MyWorkerManager.init_work_manager
235
+
236
+ # launch processor server
237
+ mgr = ScbiMapreduce::Manager.new(ip, port, workers, MyWorkerManager, custom_worker_file, STDOUT)
238
+
239
+ You can also set additional properties:
240
+
241
+
242
+ # if you want basic checkpointing. Some performance drop should be expected
243
+ # mgr.checkpointing=true
244
+
245
+ # if you want to keep the order of input data. Some performance drop should be expected
246
+ # mgr.keep_order=true
247
+
248
+ # you can set the size of packets of data sent to workers
249
+ mgr.chunk_size=100
250
+
251
+
252
+ And finally, start the server:
253
+
254
+ # start processing
255
+ mgr.start_server
256
+
257
+
258
+ # this line is reached when all data has been processed
259
+ puts "Program finished"
260
+
261
+
262
+ == REQUIREMENTS:
263
+
264
+ * Ruby 1.9.2 (you can install it by: rvm install 1.9.2)
265
+ * OSX, Linux, UNIX and other UNIX-like operating systems. (Windows may work if ssh is available to spawn jobs. Not tested)
266
+ * eventmachine gem (is automatically installed)
267
+
268
+ == INSTALL:
269
+
270
+ * gem install scbi_mapreduce
271
+
272
+ == LICENSE:
273
+
274
+ (The MIT License)
275
+
276
+ Copyright (c) 2010 Dario Guerrero
277
+
278
+ Permission is hereby granted, free of charge, to any person obtaining
279
+ a copy of this software and associated documentation files (the
280
+ 'Software'), to deal in the Software without restriction, including
281
+ without limitation the rights to use, copy, modify, merge, publish,
282
+ distribute, sublicense, and/or sell copies of the Software, and to
283
+ permit persons to whom the Software is furnished to do so, subject to
284
+ the following conditions:
285
+
286
+ The above copyright notice and this permission notice shall be
287
+ included in all copies or substantial portions of the Software.
288
+
289
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
290
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
291
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
292
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
293
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
294
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
295
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Rakefile ADDED
@@ -0,0 +1,28 @@
1
+ require 'rubygems'
2
+ gem 'hoe', '>= 2.1.0'
3
+ require 'hoe'
4
+ require 'fileutils'
5
+ require './lib/scbi_mapreduce'
6
+
7
+ Hoe.plugin :newgem
8
+ # Hoe.plugin :website
9
+ # Hoe.plugin :cucumberfeatures
10
+
11
+ # Generate all the Rake tasks
12
+ # Run 'rake -T' to see list of generated tasks (from gem root directory)
13
+ $hoe = Hoe.spec 'scbi_mapreduce' do
14
+ self.developer 'Dario Guerrero', 'dariogf@gmail.com'
15
+ self.post_install_message = 'PostInstall.txt' # TODO remove if post-install message not required
16
+ self.rubyforge_name = self.name # TODO this is default value
17
+ # self.extra_deps = [['activesupport','>= 2.0.2']]
18
+ self.extra_deps = [['eventmachine','>= 0.12.0']]
19
+
20
+
21
+ end
22
+
23
+ require 'newgem/tasks'
24
+ Dir['tasks/**/*.rake'].each { |t| load t }
25
+
26
+ # TODO - want other tests/tasks run by default? Add them to the list
27
+ # remove_task :default
28
+ # task :default => [:spec, :features]
@@ -0,0 +1,52 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ source_base= File.join(File.dirname(File.dirname(__FILE__)),'skeleton')
4
+
5
+ if ARGV.count<2
6
+ puts "Tool to create a scbi_mapreduce demo application that you can use as a template"
7
+ puts
8
+ puts "Usage #{$0} app_name template"
9
+ puts
10
+ puts "E.g.: #{$0} my_app simple"
11
+ puts
12
+ puts "====== AVAILABLE TEMPLATES ======"
13
+
14
+ s=`ls #{source_base}`
15
+ puts s
16
+
17
+ exit
18
+ end
19
+
20
+ app_name = ARGV[0]
21
+ template = ARGV[1]
22
+
23
+ if File.exists?(app_name)
24
+ puts "#{app_name} already exists, aborting"
25
+ exit -1
26
+ end
27
+
28
+ source_base= File.join(source_base,template)
29
+
30
+ files=['main.rb','my_worker.rb','my_worker_manager.rb']
31
+
32
+ puts "Creating scbi_mapreduce application: #{app_name}"
33
+ puts
34
+ puts "Creating files:"
35
+ puts "="*20
36
+ system("cp -r #{source_base} #{app_name}")
37
+
38
+ # puts files
39
+ s=`find #{app_name}`
40
+ puts s
41
+
42
+ description_file=File.join(source_base,'README.txt')
43
+
44
+ if File.exists?(description_file)
45
+ puts
46
+ puts File.read(description_file)
47
+ end
48
+
49
+ # files.each do |file|
50
+ # puts "Creating file: #{file}"
51
+ # system("cp -r #{File.join(source_base,file)} #{app_name}")
52
+ # end