parallel-forkmanager 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,460 @@
1
+ # Parallel::ForkManager -- A simple parallel processing fork manager.
2
+ #
3
+ #
4
+ # Copyright (c) 2008 Nathan Patwardhan
5
+ #
6
+ # Author: Nathan Patwardhan <noopy.org@gmail.com>
7
+ #
8
+ # Documentation: Nathan Patwardhan <noopy.org@gmail.com>, based on Perl Parallel::ForkManager documentation by Noah Robin <sitz@onastick.net> and dlux <dlux@kapu.hu>.
9
+ #
10
+ # Credits (for original Perl implementation):
11
+ # - Chuck Hirstius <chirstius@megapathdsl.net> (callback exit status, original Perl example)
12
+ # - Grant Hopwood <hopwoodg@valero.com> (win32 port)
13
+ # - Mark Southern <mark_southern@merck.com> (bugfix)
14
+ #
15
+ # Credits (Ruby port):
16
+ # - Robert Klemme <shortcutter@googlemail.com> (clarification on Ruby lambda)
17
+ # - David A. Black <dblack@rubypal.com> (clarification on Ruby lambda)
18
+ # - Roger Pack <rogerdpack@gmail.com> (bugfix)
19
+ #
20
+ # == Overview
21
+ #
22
+ # Parallel::ForkManager is used for operations that you would like to do in parallel
23
+ # (e.g. downloading a bunch of web content simultaneously) but would prefer to use
24
+ # fork() instead of threads. Instead of managing child processes yourself Parallel::ForkManager
25
+ # handles the cleanup for you. Parallel::ForkManager also provides some nifty callbacks
26
+ # you can use at start and finish, or while you're waiting for child processes to complete.
27
+ #
28
+ # == Introduction
29
+ #
30
+ # If you've used fork() before, you're well aware that you need to be responsible
31
+ # for managing (i.e. cleaning up) the processes that were created as a result.
32
+ # Parallel::ForkManager handles this for you such that you start() and finish()
33
+ # a process without having to worry about child processes along the way.
34
+ #
35
+ # For instance you can use the following code to grab a list of webpages in
36
+ # parallel using Net::HTTP -- and store the output in files.
37
+ #
38
+ # == Example
39
+ #
40
+ # #!/usr/bin/env ruby
41
+ #
42
+ # require 'net/http'
43
+ # require 'Parallel/ForkManager'
44
+ #
45
+ # save_dir = '/tmp'
46
+ #
47
+ # my_urls = [
48
+ # 'http://www.cnn.com/index.html',
49
+ # 'http://www.oreilly.com/index.html',
50
+ # 'http://www.cakewalk.com/index.html',
51
+ # 'http://www.asdfsemicolonl.kj/index.htm'
52
+ # ]
53
+ #
54
+ # max_proc = 20
55
+ # pfm = Parallel::ForkManager.new(max_proc)
56
+ #
57
+ # pfm.run_on_finish(
58
+ # lambda {
59
+ # |pid,exit_code,ident|
60
+ # print "** PID (#{pid}) for #{ident} exited with code #{exit_code}!\n"
61
+ # }
62
+ # )
63
+ #
64
+ # for my_url in my_urls
65
+ # pfm.start(my_url) and next
66
+ #
67
+ # url = URI.parse(my_url)
68
+ #
69
+ # begin
70
+ # req = Net::HTTP::Get.new(url.path)
71
+ # res = Net::HTTP.start(url.host, url.port) {|http|
72
+ # http.request(req)
73
+ # }
74
+ # rescue
75
+ # pfm.finish(255)
76
+ # end
77
+ #
78
+ # status = res.code
79
+ # out_file = save_dir + '/' + url.host + '.txt';
80
+ #
81
+ # if status.to_i == 200
82
+ # f = File.open(out_file, 'w')
83
+ # f.print res.body
84
+ # f.close()
85
+ # pfm.finish(0)
86
+ # else
87
+ # pfm.finish(255)
88
+ # end
89
+ # end
90
+ #
91
+ # pfm.wait_all_children()
92
+ #
93
+ # First you need to instantiate the ForkManager with the "new" constructor.
94
+ # You must specify the maximum number of processes to be created. If you
95
+ # specify 0, then NO fork will be done; this is good for debugging purposes.
96
+ #
97
+ # Next, use pfm.start() to do the fork. pfm.start() returns nil in the child process
98
+ # and the child's pid in the parent process. The "and next" skips the rest of the
99
+ # loop body in the parent process.
100
+ #
101
+ # - pfm.start() dies if the fork fails.
102
+ #
103
+ # - pfm.finish() terminates the child process (assuming a fork was done in the "start").
104
+ #
105
+ # - You cannot use pfm.start() if you are already in the child process.
106
+ # If you want to manage another set of subprocesses in the child process,
107
+ # you must instantiate another Parallel::ForkManager object!
108
+ #
109
+ # == Bugs and Limitations
110
+ #
111
+ # Parallel::ForkManager is a Ruby-centric rebase of Perl Parallel::ForkManager 0.7.5.
112
+ # While much of the original code was rewritten such that ForkManager worked in the "Ruby way",
113
+ # you might find some "warts" due to inconsistencies between Ruby and the original Perl code.
114
+ #
115
+ # Do not use Parallel::ForkManager in an environment where other child
116
+ # processes can affect the run of the main program, so using this module
117
+ # is not recommended in an environment where fork() / wait() is already used.
118
+ #
119
+ # If you want to use more than one copy of the Parallel::ForkManager then
120
+ # you have to make sure that all child processes are terminated -- before you
121
+ # use the second object in the main program.
122
+ #
123
+ # You are free to use a new copy of Parallel::ForkManager in the child
124
+ # processes, although I don't think it makes sense.
125
+ #
126
+
127
+ module Parallel
128
+
129
+ class ForkManager
130
+ VERSION = '1.0.1' # $Revision: 1.2 $
131
+
132
+ # Set debug to 1 for debugging messages.
133
+ attr_accessor :debug
134
+ attr_accessor :max_proc, :processes, :in_child, :on_wait_period
135
+ attr_accessor :do_on_start, :do_on_finish, :do_on_wait
136
+
137
+ def initialize(procs)
138
+ @debug = 0
139
+ @max_proc = procs
140
+ @processes = {}
141
+ @do_on_finish = {}
142
+ @in_child = 0
143
+
144
+ if self.debug == 1
145
+ print "in initialize #{max_proc}!\n"
146
+ end
147
+ end
148
+
149
+ #
150
+ # start("string") -- "string" identification is optional.
151
+ #
152
+ # start("string") "puts the fork in Parallel::ForkManager" -- as start() does
153
+ # the fork().
154
+ #
155
+ # start("string") takes an optional "string" argument to
156
+ # use as a process identifier. It is used by
157
+ # the "run_on_finish" callback for identifying the finished
158
+ # process. See run_on_finish() for more information.
159
+ #
160
+ # Return: PID of child process if in parent, or nil if in the
161
+ # child process.
162
+
163
+ def start(identification=nil)
164
+ if self.in_child == 1
165
+ puts "Cannot start another process while you are in the child process"
166
+ exit 1
167
+ end
168
+
169
+ while(self.processes.length() >= self.max_proc)
170
+ self.on_wait()
171
+ if defined? self.on_wait_period
172
+ arg = Process::WNOHANG
173
+ else
174
+ arg = nil
175
+ end
176
+ self.wait_one_child(arg)
177
+ end
178
+
179
+ self.wait_children()
180
+
181
+ if self.max_proc
182
+ pid = fork()
183
+ if ! defined? pid
184
+ print "Cannot fork #{$!}\n"
185
+ exit 1
186
+ end
187
+
188
+ if pid != nil
189
+ self.processes[pid] = identification
190
+ self.on_start(pid, identification)
191
+ else
192
+ if ! pid
193
+ self.in_child = 1
194
+ end
195
+ end
196
+ return pid
197
+ else
198
+ self.processes[$$] = identification
199
+ self.on_start($$, identification)
200
+ return 0
201
+ end
202
+ end
203
+
204
+ #
205
+ # finish(exit_code) -- exit_code is optional
206
+ #
207
+ # finish() closes the child process by exiting and accepts an optional exit code.
208
+ # Default exit code is 0 and can be retrieved in the parent via callback.
209
+ # If you're running the program in debug mode (max_proc == 0), this method
210
+ # doesn't do anything.
211
+ #
212
+ def finish(exit_code = 0)
213
+ if self.in_child == 1
214
+ exit exit_code || 0
215
+ end
216
+
217
+ if self.max_proc == 0
218
+ self.on_finish($$, exit_code, self.processes[$$], 0, 0)
219
+ self.processes.delete($$)
220
+ end
221
+
222
+ return 0
223
+ end
224
+
225
+ def wait_children()
226
+ return if self.processes.empty?
227
+
228
+ kid = nil # Should our default be nil?
229
+ loop do
230
+ kid = self.wait_one_child(Process::WNOHANG)
231
+ break if kid > 0 || kid < -1
232
+ end
233
+ end
234
+
235
+ alias :wait_childs :wait_children # compatibility
236
+
237
+ #
238
+ # Probably won't want to call this directly. Just let wait_all_children(...)
239
+ # make the call for you.
240
+ #
241
+ def wait_one_child(parent)
242
+ kid = nil
243
+ while true
244
+ # Call _NT_waitpid(...) if we're using a Windows or Java variant.
245
+ if(RUBY_PLATFORM =~ /mswin|mingw|bccwin|wince|emx|java/)
246
+ kid = self._NT_waitpid(-1, parent ||= 0)
247
+ else
248
+ kid = self._waitpid(-1, parent ||= 0)
249
+ end
250
+ last if kid == 0 or kid == -1 # Win32 returns negative PIDs
251
+ redo if ! self.processes.has_key?(kid)
252
+ id = self.processes.delete(kid)
253
+ self.on_finish(kid, $? >> 8, id, $? & 0x7f, $? & 0x80 ? 1 : 0)
254
+ break
255
+ end
256
+
257
+ kid
258
+ end
259
+
260
+ #
261
+ # wait_all_children() will wait for all the processes which have been
262
+ # forked. This is a blocking wait.
263
+ #
264
+ def wait_all_children()
265
+ while ! self.processes.empty?
266
+ self.on_wait()
267
+ if defined? self.on_wait_period
268
+ arg = Process::WNOHANG
269
+ else
270
+ arg = nil
271
+ end
272
+ self.wait_one_child(arg)
273
+ end
274
+ end
275
+
276
+ alias :wait_all_childs :wait_all_children # compatibility
277
+
278
+ #
279
+ # You can define a run_on_finish(...) callback that is called in the parent
280
+ # process when a child is terminated.
281
+ #
282
+ # The parameters of run_on_finish(...) are:
283
+ #
284
+ # - pid of the process, which is terminated
285
+ # - exit code of the program
286
+ # - identification of the process (if provided in the "start" method)
287
+ # - exit signal (0-127: signal name)
288
+ # - core dump (1 if there was core dump at exit)
289
+ #
290
+ # Example:
291
+ #
292
+ # pfm.run_on_finish(
293
+ # lambda {
294
+ # |pid,exit_code,ident|
295
+ # print "** PID (#{pid}) for #{ident} exited with code #{exit_code}!\n"
296
+ # }
297
+ # )
298
+ #
299
+ def run_on_finish(code, pid=0)
300
+ begin
301
+ self.do_on_finish[pid] = code
302
+ rescue
303
+ raise "couldn't run on finish!\n"
304
+ end
305
+ end
306
+
307
+ def on_finish(*params)
308
+ pid = params[0]
309
+ code = self.do_on_finish[pid] || self.do_on_finish[0] or return 0
310
+ begin
311
+ my_argc = code.arity - 1
312
+ if my_argc > 0
313
+ my_params = params[0 .. my_argc]
314
+ else
315
+ my_params = [params[0]]
316
+ end
317
+ params = my_params
318
+ code.call(*params)
319
+ rescue
320
+ raise "on finish failed!\n"
321
+ end
322
+ end
323
+
324
+ #
325
+ # You can define a subroutine which is called when the child process needs
326
+ # to wait for the startup. If period is not defined, then one call is done per
327
+ # child. If period is defined, then code is called periodically and the
328
+ # method waits for "period" seconds between the two calls. Note, period can be
329
+ # fractional number also. The exact "period seconds" is not guaranteed,
330
+ # signals can shorten and the process scheduler can make it longer (i.e. on
331
+ # busy systems).
332
+ #
333
+ # No parameters are passed to code on the call.
334
+ #
335
+ # Example:
336
+ # timeout = 0.5
337
+ # pfm.run_on_wait(
338
+ # lambda {
339
+ # print "** Have to wait for one child ...\n"
340
+ # },
341
+ # timeout
342
+ # )
343
+ #
344
+ def run_on_wait(code, period)
345
+ self.do_on_wait = code
346
+ self.on_wait_period = period
347
+ end
348
+
349
+ def on_wait()
350
+ begin
351
+ if self.do_on_wait.class().name == 'Proc'
352
+ self.do_on_wait.call()
353
+ if defined? self.on_wait_period
354
+ #
355
+ # Unfortunately Ruby 1.8 has no concept of 'sigaction',
356
+ # so we're unable to check if a signal handler has
357
+ # already been installed for a given signal. In this
358
+ # case it's no matter, since we define handler, but yikes.
359
+ #
360
+ Signal.trap("CHLD") do
361
+ lambda{}.call()
362
+ end
363
+ IO.select(nil, nil, nil, self.on_wait_period)
364
+ end
365
+ end
366
+ end
367
+ end
368
+
369
+ #
370
+ # You can define a subroutine which is called when a child is started. It is
371
+ # called after a successful startup of a child in the parent process.
372
+ #
373
+ # The parameters of code are as follows:
374
+ # - pid of the process which has been started
375
+ # - identification of the process (if provided in the "start" method)
376
+ #
377
+ # Example:
378
+ #
379
+ # pfm.run_on_start(
380
+ # lambda {
381
+ # |pid,ident|
382
+ # print "run on start ::: #{ident} (#{pid})\n"
383
+ # }
384
+ # )
385
+ #
386
+ def run_on_start(code)
387
+ begin
388
+ self.do_on_start = code
389
+ rescue
390
+ raise "run on start failed!\n"
391
+ end
392
+ end
393
+
394
+ def on_start(*params)
395
+ begin
396
+ if self.do_on_start.class().name == 'Proc'
397
+ my_argc = self.do_on_start.arity - 1
398
+ if my_argc > 0
399
+ my_params = params[0 .. my_argc]
400
+ else
401
+ my_params = params[0]
402
+ end
403
+ params = my_params
404
+ self.do_on_start.call(*params)
405
+ end
406
+ rescue
407
+ raise "on_start failed\n"
408
+ end
409
+ end
410
+
411
+ #
412
+ # set_max_procs(mp) -- mp is an integer
413
+ #
414
+ # set_max_procs() allows you to set a new maximum number of children to maintain.
415
+ #
416
+ # Return: The previous setting of max_procs.
417
+ #
418
+ def set_max_procs(mp=nil)
419
+ if mp == nil
420
+ return self.max_proc
421
+ else
422
+ self.max_proc = mp
423
+ end
424
+ end
425
+
426
+ #
427
+ # _waitpid(...) should not be called directly as it is called automatically by
428
+ # wait_one_child(...).
429
+ #
430
+ def _waitpid(pid, flags)
431
+ return Process.waitpid(pid, flags)
432
+ end
433
+
434
+ #
435
+ # _NT_waitpid(...) is the Windows variant of _waitpid(...) and will be called
436
+ # automatically by wait_one_child(...) depending on the value of RUBY_PLATFORM.
437
+ # You should not call _NT_waitpid(...) directly.
438
+ #
439
+ def _NT_waitpid(pid, par)
440
+ if par == Process::WNOHANG
441
+ pids = self.processes.keys()
442
+ if pids.length() == 0
443
+ return -1
444
+ end
445
+
446
+ kid = 0
447
+ for my_pid in pids
448
+ kid = Process.waitpid(my_pid, par)
449
+ if kid != 0
450
+ return kid
451
+ end
452
+ return kid
453
+ end
454
+ else
455
+ return Process.waitpid(pid, par)
456
+ end
457
+ end
458
+ end
459
+
460
+ end
@@ -0,0 +1,53 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'net/http'
4
+ require 'lib/parallel/forkmanager'
5
+
6
+ save_dir = '/tmp'
7
+
8
+ my_urls = [
9
+ 'http://www.cnn.com/index.html',
10
+ 'http://www.oreilly.com/index.html',
11
+ 'http://www.cakewalk.com/index.html',
12
+ 'http://www.asdfsemicolonl.kj/index.htm'
13
+ ]
14
+
15
+ max_proc = 20
16
+ pfm = Parallel::ForkManager.new(max_proc)
17
+
18
+ pfm.run_on_finish(
19
+ lambda {
20
+ |pid,exit_code,ident|
21
+ print "** PID (#{pid}) for #{ident} exited with code #{exit_code}!\n"
22
+ }
23
+ )
24
+
25
+ for my_url in my_urls
26
+ pfm.start(my_url) and next
27
+
28
+ url = URI.parse(my_url)
29
+
30
+ begin
31
+ req = Net::HTTP::Get.new(url.path)
32
+ res = Net::HTTP.start(url.host, url.port) {|http|
33
+ http.request(req)
34
+ }
35
+ rescue
36
+ pfm.finish(255)
37
+ end
38
+
39
+ status = res.code
40
+ out_file = save_dir + '/' + url.host + '.txt';
41
+
42
+ if status.to_i == 200
43
+ f = File.open(out_file, 'w')
44
+ f.print res.body
45
+ f.close()
46
+ pfm.finish(0)
47
+ else
48
+ pfm.finish(255)
49
+ end
50
+ end
51
+
52
+ pfm.wait_all_children()
53
+
@@ -0,0 +1,40 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "lib/parallel/forkmanager"
4
+
5
+ num_procs = 20
6
+ pfm = Parallel::ForkManager.new(num_procs)
7
+
8
+ items = 1..10
9
+
10
+ pfm.run_on_start(
11
+ lambda {
12
+ |pid,ident|
13
+ print "run on start ::: #{ident} (#{pid})\n"
14
+ }
15
+ )
16
+
17
+ pfm.run_on_finish(
18
+ lambda {
19
+ |pid,exit_code,ident|
20
+ print " on_finish: ** PID: #{pid} EXIT: #{exit_code} IDENT: #{ident}\n"
21
+ }
22
+ )
23
+
24
+ timeout = 0.5
25
+ pfm.run_on_wait(
26
+ lambda {
27
+ print "** Have to wait for one child ...\n"
28
+ },
29
+ timeout
30
+ )
31
+
32
+ for item in items
33
+ my_item = 'nate-' + item.to_s
34
+ pid = pfm.start(my_item) and next
35
+
36
+ pfm.finish()
37
+ end
38
+
39
+ pfm.wait_all_children()
40
+
metadata ADDED
@@ -0,0 +1,55 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: parallel-forkmanager
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Nathan Patwardhan
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-10-24 00:00:00 -04:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description:
17
+ email: noopy.org @nospam@ gmail.com
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files: []
23
+
24
+ files:
25
+ - lib/parallel/forkmanager.rb
26
+ - use_pfm.rb
27
+ - parallel_http_get.rb
28
+ has_rdoc: true
29
+ homepage: http://rubyforge.org/projects/parallelforkmgr/
30
+ post_install_message:
31
+ rdoc_options: []
32
+
33
+ require_paths:
34
+ - lib/parallel
35
+ required_ruby_version: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ version: "0"
40
+ version:
41
+ required_rubygems_version: !ruby/object:Gem::Requirement
42
+ requirements:
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ version: "0"
46
+ version:
47
+ requirements: []
48
+
49
+ rubyforge_project: parallelforkmgr
50
+ rubygems_version: 1.3.1
51
+ signing_key:
52
+ specification_version: 2
53
+ summary: A simple parallel processing fork manager.
54
+ test_files: []
55
+