parallel-forkmanager 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,460 @@
1
+ # Parallel::ForkManager -- A simple parallel processing fork manager.
2
+ #
3
+ #
4
+ # Copyright (c) 2008 Nathan Patwardhan
5
+ #
6
+ # Author: Nathan Patwardhan <noopy.org@gmail.com>
7
+ #
8
+ # Documentation: Nathan Patwardhan <noopy.org@gmail.com>, based on Perl Parallel::ForkManager documentation by Noah Robin <sitz@onastick.net> and dlux <dlux@kapu.hu>.
9
+ #
10
+ # Credits (for original Perl implementation):
11
+ # - Chuck Hirstius <chirstius@megapathdsl.net> (callback exit status, original Perl example)
12
+ # - Grant Hopwood <hopwoodg@valero.com> (win32 port)
13
+ # - Mark Southern <mark_southern@merck.com> (bugfix)
14
+ #
15
+ # Credits (Ruby port):
16
+ # - Robert Klemme <shortcutter@googlemail.com> (clarification on Ruby lambda)
17
+ # - David A. Black <dblack@rubypal.com> (clarification on Ruby lambda)
18
+ # - Roger Pack <rogerdpack@gmail.com> (bugfix)
19
+ #
20
+ # == Overview
21
+ #
22
+ # Parallel::ForkManager is used for operations that you would like to do in parallel
23
+ # (e.g. downloading a bunch of web content simultaneously) but would prefer to use
24
+ # fork() instead of threads. Instead of managing child processes yourself Parallel::ForkManager
25
+ # handles the cleanup for you. Parallel::ForkManager also provides some nifty callbacks
26
+ # you can use at start and finish, or while you're waiting for child processes to complete.
27
+ #
28
+ # == Introduction
29
+ #
30
+ # If you've used fork() before, you're well aware that you need to be responsible
31
+ # for managing (i.e. cleaning up) the processes that were created as a result.
32
+ # Parallel::ForkManager handles this for you such that you start() and finish()
33
+ # a process without having to worry about child processes along the way.
34
+ #
35
+ # For instance you can use the following code to grab a list of webpages in
36
+ # parallel using Net::HTTP -- and store the output in files.
37
+ #
38
+ # == Example
39
+ #
40
+ # #!/usr/bin/env ruby
41
+ #
42
+ # require 'net/http'
43
+ # require 'Parallel/ForkManager'
44
+ #
45
+ # save_dir = '/tmp'
46
+ #
47
+ # my_urls = [
48
+ # 'http://www.cnn.com/index.html',
49
+ # 'http://www.oreilly.com/index.html',
50
+ # 'http://www.cakewalk.com/index.html',
51
+ # 'http://www.asdfsemicolonl.kj/index.htm'
52
+ # ]
53
+ #
54
+ # max_proc = 20
55
+ # pfm = Parallel::ForkManager.new(max_proc)
56
+ #
57
+ # pfm.run_on_finish(
58
+ # lambda {
59
+ # |pid,exit_code,ident|
60
+ # print "** PID (#{pid}) for #{ident} exited with code #{exit_code}!\n"
61
+ # }
62
+ # )
63
+ #
64
+ # for my_url in my_urls
65
+ # pfm.start(my_url) and next
66
+ #
67
+ # url = URI.parse(my_url)
68
+ #
69
+ # begin
70
+ # req = Net::HTTP::Get.new(url.path)
71
+ # res = Net::HTTP.start(url.host, url.port) {|http|
72
+ # http.request(req)
73
+ # }
74
+ # rescue
75
+ # pfm.finish(255)
76
+ # end
77
+ #
78
+ # status = res.code
79
+ # out_file = save_dir + '/' + url.host + '.txt';
80
+ #
81
+ # if status.to_i == 200
82
+ # f = File.open(out_file, 'w')
83
+ # f.print res.body
84
+ # f.close()
85
+ # pfm.finish(0)
86
+ # else
87
+ # pfm.finish(255)
88
+ # end
89
+ # end
90
+ #
91
+ # pfm.wait_all_children()
92
+ #
93
+ # First you need to instantiate the ForkManager with the "new" constructor.
94
+ # You must specify the maximum number of processes to be created. If you
95
+ # specify 0, then NO fork will be done; this is good for debugging purposes.
96
+ #
97
+ # Next, use pfm.start() to do the fork. pfm.start() returns nil for the child process,
98
+ # and child pid for the parent process. The "and next" skips the internal
99
+ # loop in the parent process.
100
+ #
101
+ # - pfm.start() dies if the fork fails.
102
+ #
103
+ # - pfm.finish() terminates the child process (assuming a fork was done in the "start").
104
+ #
105
+ # - You cannot use pfm.start() if you are already in the child process.
106
+ # If you want to manage another set of subprocesses in the child process,
107
+ # you must instantiate another Parallel::ForkManager object!
108
+ #
109
+ # == Bugs and Limitations
110
+ #
111
+ # Parallel::ForkManager is a Ruby-centric rebase of Perl Parallel::ForkManager 0.7.5.
112
+ # While much of the original code was rewritten such that ForkManager worked in the "Ruby way",
113
+ # you might find some "warts" due to inconsistencies between Ruby and the original Perl code.
114
+ #
115
+ # Do not use Parallel::ForkManager in an environment where other child
116
+ # processes can affect the run of the main program; in particular, this module
117
+ # is not recommended in an environment where fork() / wait() is already used.
118
+ #
119
+ # If you want to use more than one copy of the Parallel::ForkManager then
120
+ # you have to make sure that all children processes are terminated -- before you
121
+ # use the second object in the main program.
122
+ #
123
+ # You are free to use a new copy of Parallel::ForkManager in the child
124
+ # processes, although I don't think it makes sense.
125
+ #
126
+
127
module Parallel

  # Runs at most +max_proc+ forked child processes at a time and reaps them,
  # invoking optional callbacks when a child starts, when the parent has to
  # wait for a free slot, and when a child finishes.
  class ForkManager
    VERSION = '1.0.1' # $Revision: 1.2 $

    # Set debug to 1 for debugging messages.
    attr_accessor :debug
    attr_accessor :max_proc, :processes, :in_child, :on_wait_period
    attr_accessor :do_on_start, :do_on_finish, :do_on_wait

    #
    # new(procs) -- procs is the maximum number of children to run at once.
    #
    # Passing 0 (or nil) disables forking entirely: start() then runs the
    # "child" code inline in the current process, which is handy for debugging.
    #
    def initialize(procs)
      @debug = 0
      @max_proc = procs
      @processes = {}       # pid => identification given to start()
      @do_on_start = nil
      @do_on_finish = {}    # pid (or 0 for "any pid") => callback
      @do_on_wait = nil
      @on_wait_period = nil
      @in_child = 0         # 1 once we are running inside a forked child

      # @debug was just set to 0, so this only prints if a subclass
      # overrides #debug.
      print "in initialize #{max_proc}!\n" if debug == 1
    end

    #
    # start("string") -- "string" identification is optional.
    #
    # start("string") "puts the fork in Parallel::ForkManager" -- as start() does
    # the fork().
    #
    # start("string") takes an optional "string" argument to
    # use as a process identifier. It is used by
    # the "run_on_finish" callback for identifying the finished
    # process. See run_on_finish() for more information.
    #
    # If max_proc children are already running, start() first waits until at
    # least one of them has been reaped.
    #
    # Return: PID of child process if in parent, or nil if in the child
    # process. nil (not 0) is returned because 0 is truthy in Ruby and the
    # usual idiom is "pfm.start(id) and next". When forking is disabled
    # (max_proc is 0 or nil) no fork is done and nil is returned as well, so
    # the caller falls through into the "child" code.
    #
    # Any exception raised by fork() propagates to the caller.
    #
    def start(identification = nil)
      if in_child == 1
        puts "Cannot start another process while you are in the child process"
        exit 1
      end

      unless fork_enabled?
        # Debug mode: pretend the current process is the child.
        processes[$$] = identification
        on_start($$, identification)
        return nil
      end

      # Block until there is a free slot.
      while processes.length >= max_proc
        on_wait
        wait_one_child(on_wait_period ? Process::WNOHANG : nil)
      end

      # Opportunistically reap any other children that already exited.
      wait_children

      pid = fork
      if pid
        processes[pid] = identification
        on_start(pid, identification)
      else
        self.in_child = 1
      end
      pid
    end

    #
    # finish(exit_code) -- exit_code is optional
    #
    # finish() closes the child process by exiting and accepts an optional exit code.
    # Default exit code is 0 and can be retrieved in the parent via callback.
    # If forking is disabled (max_proc is 0 or nil), nothing exits: the
    # run_on_finish callback is invoked directly for the current process and
    # 0 is returned.
    #
    def finish(exit_code = 0)
      exit(exit_code || 0) if in_child == 1

      unless fork_enabled?
        on_finish($$, exit_code, processes[$$], 0, 0)
        processes.delete($$)
      end

      0
    end

    #
    # Reaps, without blocking, every tracked child that has already exited.
    # Stops as soon as no further child is ready.
    #
    def wait_children
      until processes.empty?
        kid = wait_one_child(Process::WNOHANG)
        break if kid == 0 || kid == -1 # nothing (more) to reap right now
      end
    end

    alias :wait_childs :wait_children # compatibility

    #
    # Probably won't want to call this directly. Just let wait_all_children(...)
    # make the call for you.
    #
    # Waits for one tracked child using the given waitpid flags (nil means a
    # blocking wait), removes it from the process table and fires the
    # run_on_finish callback with pid, exit code, identification, terminating
    # signal and core-dump flag. Children that are not in the process table
    # are reaped and ignored.
    #
    # Return: the pid of the reaped child, 0 if a non-blocking wait found no
    # exited child, or -1 if the Windows helper reports nothing to wait for.
    #
    def wait_one_child(parent)
      flags = parent || 0

      loop do
        # Call _NT_waitpid(...) if we're using a Windows or Java variant.
        kid = if RUBY_PLATFORM =~ /mswin|mingw|bccwin|wince|emx|java/
                _NT_waitpid(-1, flags)
              else
                _waitpid(-1, flags)
              end
        status = $? # capture before anything else can overwrite it

        # Process.waitpid returns nil when WNOHANG finds no exited child.
        kid ||= 0
        return kid if kid == 0 || kid == -1

        next unless processes.has_key?(kid) # not one of ours; keep waiting

        ident = processes.delete(kid)
        on_finish(kid,
                  status.exitstatus || 0,  # nil when killed by a signal
                  ident,
                  status.termsig || 0,     # nil when exited normally
                  status.coredump? ? 1 : 0)
        return kid
      end
    end

    #
    # wait_all_children() will wait for all the processes which have been
    # forked. This is a blocking wait. If a wait period was given to
    # run_on_wait(), the on-wait callback is invoked once per period while
    # waiting.
    #
    def wait_all_children
      until processes.empty?
        on_wait
        wait_one_child(on_wait_period ? Process::WNOHANG : nil)
      end
    end

    alias :wait_all_childs :wait_all_children # compatibility

    #
    # You can define run_on_finish(...) that is called in the parent
    # process when a child is terminated.
    #
    # The optional second argument restricts the callback to one pid; the
    # default (0) registers it for every child.
    #
    # The parameters of run_on_finish(...) are:
    #
    # - pid of the process, which is terminated
    # - exit code of the program
    # - identification of the process (if provided in the "start" method)
    # - exit signal (0-127: signal name)
    # - core dump (1 if there was core dump at exit)
    #
    # The callback receives only as many of these as its arity asks for.
    #
    # Example:
    #
    #   pfm.run_on_finish(
    #     lambda {
    #       |pid,exit_code,ident|
    #       print "** PID (#{pid}) for #{ident} exited with code #{exit_code}!\n"
    #     }
    #   )
    #
    def run_on_finish(code, pid = 0)
      do_on_finish[pid] = code
    end

    #
    # Invokes the finish callback registered for params[0] (the pid), falling
    # back to the catch-all callback registered under 0. Returns 0 when no
    # callback is registered. Any error raised while calling the callback is
    # re-raised as RuntimeError "on finish failed!".
    #
    def on_finish(*params)
      code = do_on_finish[params[0]] || do_on_finish[0]
      return 0 unless code

      begin
        code.call(*callback_args(code, params))
      rescue
        raise "on finish failed!\n"
      end
    end

    #
    # You can define a subroutine which is called when the parent process needs
    # to wait for a child. If period is not given, then one call is done per
    # child. If period is given, then code is called periodically and the
    # method waits for "period" seconds between the two calls. Note, period can be
    # fractional number also. The exact "period seconds" is not guaranteed,
    # signals can shorten and the process scheduler can make it longer (i.e. on
    # busy systems).
    #
    # No parameters are passed to code on the call.
    #
    # Example:
    #   timeout = 0.5
    #   pfm.run_on_wait(
    #     lambda {
    #       print "** Have to wait for one child ...\n"
    #     },
    #     timeout
    #   )
    #
    def run_on_wait(code, period = nil)
      self.do_on_wait = code
      self.on_wait_period = period
    end

    #
    # Calls the on-wait callback, if one is registered, and then sleeps for
    # on_wait_period seconds when a period was configured.
    #
    def on_wait
      return unless do_on_wait.respond_to?(:call)

      do_on_wait.call
      pause_for(on_wait_period) if on_wait_period
    end

    #
    # You can define a subroutine which is called when a child is started. It is
    # called after a successful startup of a child in the parent process.
    #
    # The parameters of code are as follows:
    # - pid of the process which has been started
    # - identification of the process (if provided in the "start" method)
    #
    # Example:
    #
    #   pfm.run_on_start(
    #     lambda {
    #       |pid,ident|
    #       print "run on start ::: #{ident} (#{pid})\n"
    #     }
    #   )
    #
    def run_on_start(code)
      self.do_on_start = code
    end

    #
    # Invokes the start callback, if one is registered, with as many of
    # (pid, identification) as its arity asks for. Any error raised while
    # calling the callback is re-raised as RuntimeError "on_start failed".
    #
    def on_start(*params)
      return unless do_on_start.respond_to?(:call)

      begin
        do_on_start.call(*callback_args(do_on_start, params))
      rescue
        raise "on_start failed\n"
      end
    end

    #
    # set_max_procs(mp) -- mp is an integer
    #
    # set_max_procs() allows you to set a new maximum number of children to maintain.
    # Called without an argument it only reports the current setting.
    #
    # Return: The previous setting of max_procs.
    #
    def set_max_procs(mp = nil)
      previous = max_proc
      self.max_proc = mp unless mp.nil?
      previous
    end

    #
    # _waitpid(...) should not be called directly as it is called automatically by
    # wait_one_child(...).
    #
    def _waitpid(pid, flags)
      Process.waitpid(pid, flags)
    end

    #
    # _NT_waitpid(...) is the Windows variant of _waitpid(...) and will be called
    # automatically by wait_one_child(...) depending on the value of RUBY_PLATFORM.
    # You should not call _NT_waitpid(...) directly.
    #
    # For a non-blocking wait every tracked pid is polled in turn.
    #
    # Return: the pid of the first tracked child found to have exited, 0 if
    # none has exited yet, or -1 if no children are tracked. For a blocking
    # wait, whatever Process.waitpid returns.
    #
    def _NT_waitpid(pid, par)
      return Process.waitpid(pid, par) unless par == Process::WNOHANG

      pids = processes.keys
      return -1 if pids.empty? # simulate "no processes awaiting cleanup"

      pids.each do |tracked|
        kid = Process.waitpid(tracked, par)
        return kid if kid && kid != 0
      end
      0
    end

    private

    # True when start() should really fork, i.e. max_proc is a positive number.
    def fork_enabled?
      max_proc ? max_proc > 0 : false
    end

    # Picks the leading arguments a callback is called with: none for a
    # zero-arity callback, the first +arity+ values when it declares two or
    # more parameters, and just the first value otherwise.
    def callback_args(code, params)
      wanted = code.arity
      return [] if wanted == 0
      return params[0, wanted] if wanted > 1
      [params[0]]
    end

    # Sleeps for +period+ seconds using IO.select. As in the original code a
    # no-op CHLD handler is installed while sleeping (where the platform has
    # SIGCHLD); the handler that was active before is put back afterwards so
    # a handler installed by the application is not lost.
    def pause_for(period)
      unless Signal.list.key?("CHLD")
        IO.select(nil, nil, nil, period)
        return
      end

      previous = Signal.trap("CHLD") { }
      begin
        IO.select(nil, nil, nil, period)
      ensure
        Signal.trap("CHLD", previous || "DEFAULT")
      end
    end
  end

end
@@ -0,0 +1,53 @@
1
#!/usr/bin/env ruby
#
# Example: fetch a list of web pages in parallel with Parallel::ForkManager
# and store each response body in <save_dir>/<host>.txt.

require 'net/http'
# Resolve the library relative to this script so the example does not depend
# on the current directory being part of $LOAD_PATH.
require File.expand_path('lib/parallel/forkmanager', File.dirname(__FILE__))

save_dir = '/tmp'

my_urls = [
  'http://www.cnn.com/index.html',
  'http://www.oreilly.com/index.html',
  'http://www.cakewalk.com/index.html',
  'http://www.asdfsemicolonl.kj/index.htm'
]

max_proc = 20
pfm = Parallel::ForkManager.new(max_proc)

# Report every finished child in the parent.
pfm.run_on_finish(
  lambda { |pid, exit_code, ident|
    print "** PID (#{pid}) for #{ident} exited with code #{exit_code}!\n"
  }
)

my_urls.each do |my_url|
  # In the parent start() returns the child's pid (truthy), so move on to the
  # next URL; the code below runs in the child only.
  pfm.start(my_url) and next

  url = URI.parse(my_url)

  begin
    req = Net::HTTP::Get.new(url.path)
    res = Net::HTTP.start(url.host, url.port) { |http| http.request(req) }
  rescue
    pfm.finish(255)
    # finish() exits when called in a forked child; if it returns (no fork
    # was done) skip the rest so the unset response is never used.
    next
  end

  if res.code.to_i == 200
    out_file = save_dir + '/' + url.host + '.txt'
    # Block form closes the file even if the write raises.
    File.open(out_file, 'w') { |f| f.print res.body }
    pfm.finish(0)
  else
    pfm.finish(255)
  end
end

pfm.wait_all_children
+
@@ -0,0 +1,40 @@
1
#!/usr/bin/env ruby
#
# Example: exercise the start, finish and wait callbacks of
# Parallel::ForkManager with ten children that exit immediately.

# Resolve the library relative to this script so the example does not depend
# on the current directory being part of $LOAD_PATH.
require File.expand_path('lib/parallel/forkmanager', File.dirname(__FILE__))

num_procs = 20
pfm = Parallel::ForkManager.new(num_procs)

items = 1..10

pfm.run_on_start(
  lambda { |pid, ident|
    print "run on start ::: #{ident} (#{pid})\n"
  }
)

pfm.run_on_finish(
  lambda { |pid, exit_code, ident|
    print " on_finish: ** PID: #{pid} EXIT: #{exit_code} IDENT: #{ident}\n"
  }
)

# Call the wait callback every half second while waiting for children.
timeout = 0.5
pfm.run_on_wait(
  lambda {
    print "** Have to wait for one child ...\n"
  },
  timeout
)

items.each do |item|
  my_item = 'nate-' + item.to_s
  # In the parent start() returns the child's pid (truthy), so move on to the
  # next item; only the child reaches finish().
  pfm.start(my_item) and next

  pfm.finish
end

pfm.wait_all_children
40
+
metadata ADDED
@@ -0,0 +1,55 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: parallel-forkmanager
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Nathan Patwardhan
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-10-24 00:00:00 -04:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description:
17
+ email: noopy.org @nospam@ gmail.com
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files: []
23
+
24
+ files:
25
+ - lib/parallel/forkmanager.rb
26
+ - use_pfm.rb
27
+ - parallel_http_get.rb
28
+ has_rdoc: true
29
+ homepage: http://rubyforge.org/projects/parallelforkmgr/
30
+ post_install_message:
31
+ rdoc_options: []
32
+
33
+ require_paths:
34
+ - lib/parallel
35
+ required_ruby_version: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ version: "0"
40
+ version:
41
+ required_rubygems_version: !ruby/object:Gem::Requirement
42
+ requirements:
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ version: "0"
46
+ version:
47
+ requirements: []
48
+
49
+ rubyforge_project: parallelforkmgr
50
+ rubygems_version: 1.3.1
51
+ signing_key:
52
+ specification_version: 2
53
+ summary: A simple parallel processing fork manager.
54
+ test_files: []
55
+